From a5f641b4b01adf44d166b4246bc3eb3b135ec215 Mon Sep 17 00:00:00 2001 From: John Sharratt's Shared Account Date: Wed, 17 Aug 2022 17:45:48 +1000 Subject: [PATCH 01/24] Implemented shared memory for Wasmer in preparation for multithreading Fixed linter Fixed clippy Cleaned up some merge leftover --- lib/api/src/js/mod.rs | 1 - lib/api/src/sys/imports.rs | 33 ++++- lib/api/src/sys/mod.rs | 4 +- lib/cli/src/commands/run/wasi.rs | 3 +- .../src/translator/code_translator.rs | 34 ++++- lib/compiler/src/translator/environ.rs | 8 +- lib/vm/src/instance/mod.rs | 5 +- lib/vm/src/lib.rs | 2 +- lib/vm/src/memory.rs | 124 ++++++++++++++++-- lib/vm/src/store.rs | 5 + 10 files changed, 190 insertions(+), 29 deletions(-) diff --git a/lib/api/src/js/mod.rs b/lib/api/src/js/mod.rs index a172dffcf7d..13cc7a2504b 100644 --- a/lib/api/src/js/mod.rs +++ b/lib/api/src/js/mod.rs @@ -73,7 +73,6 @@ pub use crate::js::value::Value as Val; pub mod vm { //! The `vm` module re-exports wasmer-vm types. - pub use crate::js::export::VMMemory; } diff --git a/lib/api/src/sys/imports.rs b/lib/api/src/sys/imports.rs index c9a9f22b917..b6864e5ee2f 100644 --- a/lib/api/src/sys/imports.rs +++ b/lib/api/src/sys/imports.rs @@ -1,11 +1,12 @@ //! The import module contains the implementation data structures and helper functions used to //! manipulate and access a wasm module's imports including memories, tables, globals, and //! functions. -use crate::{Exports, Extern, Module}; +use crate::{AsStoreMut, Exports, Extern, Memory, Module}; use std::collections::HashMap; use std::fmt; use wasmer_compiler::LinkError; use wasmer_types::ImportError; +use wasmer_vm::VMSharedMemory; /// All of the import data used when instantiating. /// @@ -111,6 +112,36 @@ impl Imports { .insert((ns.to_string(), name.to_string()), val.into()); } + /// Imports (any) shared memory into the imports. 
+ /// (if the module does not import memory then this function is ignored) + pub fn import_shared_memory( + &mut self, + module: &Module, + store: &mut impl AsStoreMut, + ) -> Option<VMSharedMemory> { + // Determine if shared memory needs to be created and imported + let shared_memory = module + .imports() + .memories() + .next() + .map(|a| *a.ty()) + .map(|ty| { + let style = store.as_store_ref().tunables().memory_style(&ty); + VMSharedMemory::new(&ty, &style).unwrap() + }); + + if let Some(memory) = shared_memory { + self.define( + "env", + "memory", + Memory::new_from_existing(store, memory.clone().into()), + ); + Some(memory) + } else { + None + } + } + /// Returns the contents of a namespace as an `Exports`. /// /// Returns `None` if the namespace doesn't exist. diff --git a/lib/api/src/sys/mod.rs b/lib/api/src/sys/mod.rs index d24be112dcb..c8408b66219 100644 --- a/lib/api/src/sys/mod.rs +++ b/lib/api/src/sys/mod.rs @@ -57,8 +57,8 @@ pub mod vm { //! The `vm` module re-exports wasmer-vm types. pub use wasmer_vm::{ - MemoryError, MemoryStyle, TableStyle, VMExtern, VMMemory, VMMemoryDefinition, VMTable, - VMTableDefinition, + MemoryError, MemoryStyle, TableStyle, VMExtern, VMMemory, VMMemoryDefinition, + VMOwnedMemory, VMSharedMemory, VMTable, VMTableDefinition, }; } diff --git a/lib/cli/src/commands/run/wasi.rs b/lib/cli/src/commands/run/wasi.rs index ffc70c42036..d913e5119c7 100644 --- a/lib/cli/src/commands/run/wasi.rs +++ b/lib/cli/src/commands/run/wasi.rs @@ -104,7 +104,8 @@ impl Wasi { is_wasix_module(module), std::sync::atomic::Ordering::Release, ); - let import_object = import_object_for_all_wasi_versions(store, &wasi_env.env); + let mut import_object = import_object_for_all_wasi_versions(store, &wasi_env.env); + import_object.import_shared_memory(module, store); let instance = Instance::new(store, module, &import_object)?; let memory = instance.exports.get_memory("memory")?; wasi_env.data_mut(store).set_memory(memory.clone()); diff --git
a/lib/compiler-cranelift/src/translator/code_translator.rs b/lib/compiler-cranelift/src/translator/code_translator.rs index c750d6c3231..e38640fb3e0 100644 --- a/lib/compiler-cranelift/src/translator/code_translator.rs +++ b/lib/compiler-cranelift/src/translator/code_translator.rs @@ -1063,15 +1063,26 @@ pub fn translate_operator( assert!(builder.func.dfg.value_type(expected) == implied_ty); // `fn translate_atomic_wait` can inspect the type of `expected` to figure out what // code it needs to generate, if it wants. - let res = environ.translate_atomic_wait( + match environ.translate_atomic_wait( builder.cursor(), heap_index, heap, addr, expected, timeout, - )?; - state.push1(res); + ) { + Ok(res) => { + state.push1(res); + } + Err(wasmer_types::WasmError::Unsupported(_err)) => { + // If multiple threads hit a mutex then the function will fail + builder.ins().trap(ir::TrapCode::UnreachableCodeReached); + state.reachable = false; + } + Err(err) => { + return Err(err); + } + }; } Operator::MemoryAtomicNotify { memarg } => { let heap_index = MemoryIndex::from_u32(memarg.memory); @@ -1079,9 +1090,20 @@ pub fn translate_operator( let count = state.pop1(); // 32 (fixed) let addr = state.pop1(); // 32 (fixed) let addr = fold_atomic_mem_addr(addr, memarg, I32, builder); - let res = - environ.translate_atomic_notify(builder.cursor(), heap_index, heap, addr, count)?; - state.push1(res); + match environ.translate_atomic_notify(builder.cursor(), heap_index, heap, addr, count) { + Ok(res) => { + state.push1(res); + } + Err(wasmer_types::WasmError::Unsupported(_err)) => { + // Simply return a zero as this function is needed for the __wasi_init_memory function + // but the equivalent notify.wait will not be called (as only one thread calls __start) + // hence these atomic operations are not needed + state.push1(builder.ins().iconst(I32, i64::from(0))); + } + Err(err) => { + return Err(err); + } + }; } Operator::I32AtomicLoad { memarg } => { translate_atomic_load(I32, I32, memarg,
builder, state, environ)? diff --git a/lib/compiler/src/translator/environ.rs b/lib/compiler/src/translator/environ.rs index e172d92b063..fc515a9c73b 100644 --- a/lib/compiler/src/translator/environ.rs +++ b/lib/compiler/src/translator/environ.rs @@ -1,7 +1,6 @@ // This file contains code from external sources. // Attributions: https://github.com/wasmerio/wasmer/blob/master/ATTRIBUTIONS.md use super::state::ModuleTranslationState; -use crate::lib::std::borrow::ToOwned; use crate::lib::std::string::ToString; use crate::lib::std::{boxed::Box, string::String, vec::Vec}; use crate::translate_module; @@ -9,13 +8,13 @@ use crate::wasmparser::{Operator, Range, Type}; use std::convert::{TryFrom, TryInto}; use wasmer_types::entity::PrimaryMap; use wasmer_types::FunctionType; +use wasmer_types::WasmResult; use wasmer_types::{ CustomSectionIndex, DataIndex, DataInitializer, DataInitializerLocation, ElemIndex, ExportIndex, FunctionIndex, GlobalIndex, GlobalInit, GlobalType, ImportIndex, LocalFunctionIndex, MemoryIndex, MemoryType, ModuleInfo, SignatureIndex, TableIndex, TableInitializer, TableType, }; -use wasmer_types::{WasmError, WasmResult}; /// Contains function data: bytecode and its offset in the module. 
#[derive(Hash)] @@ -254,11 +253,6 @@ impl<'data> ModuleEnvironment<'data> { } pub(crate) fn declare_memory(&mut self, memory: MemoryType) -> WasmResult<()> { - if memory.shared { - return Err(WasmError::Unsupported( - "shared memories are not supported yet".to_owned(), - )); - } self.module.memories.push(memory); Ok(()) } diff --git a/lib/vm/src/instance/mod.rs b/lib/vm/src/instance/mod.rs index d6b6e2341cd..dde412ff343 100644 --- a/lib/vm/src/instance/mod.rs +++ b/lib/vm/src/instance/mod.rs @@ -16,10 +16,11 @@ use crate::trap::{catch_traps, Trap, TrapCode}; use crate::vmcontext::{ memory_copy, memory_fill, VMBuiltinFunctionsArray, VMCallerCheckedAnyfunc, VMContext, VMFunctionContext, VMFunctionImport, VMFunctionKind, VMGlobalDefinition, VMGlobalImport, - VMMemoryImport, VMSharedSignatureIndex, VMTableDefinition, VMTableImport, VMTrampoline, + VMMemoryDefinition, VMMemoryImport, VMSharedSignatureIndex, VMTableDefinition, VMTableImport, + VMTrampoline, }; +use crate::LinearMemory; use crate::{FunctionBodyPtr, MaybeInstanceOwned, TrapHandlerFn, VMFunctionBody}; -use crate::{LinearMemory, VMMemoryDefinition}; use crate::{VMFuncRef, VMFunction, VMGlobal, VMMemory, VMTable}; pub use allocator::InstanceAllocator; use memoffset::offset_of; diff --git a/lib/vm/src/lib.rs b/lib/vm/src/lib.rs index 1aaae7b52b8..8d5602fefed 100644 --- a/lib/vm/src/lib.rs +++ b/lib/vm/src/lib.rs @@ -45,7 +45,7 @@ pub use crate::function_env::VMFunctionEnvironment; pub use crate::global::*; pub use crate::imports::Imports; pub use crate::instance::{InstanceAllocator, InstanceHandle}; -pub use crate::memory::{initialize_memory_with_data, LinearMemory, VMMemory}; +pub use crate::memory::{initialize_memory_with_data, LinearMemory, VMMemory, VMOwnedMemory, VMSharedMemory}; pub use crate::mmap::Mmap; pub use crate::probestack::PROBESTACK; pub use crate::sig_registry::SignatureRegistry; diff --git a/lib/vm/src/memory.rs b/lib/vm/src/memory.rs index 5a7ea5dad0d..99d69d9e363 100644 --- 
a/lib/vm/src/memory.rs +++ b/lib/vm/src/memory.rs @@ -12,6 +12,7 @@ use std::cell::UnsafeCell; use std::convert::TryInto; use std::ptr::NonNull; use std::slice; +use std::sync::{Arc, RwLock}; use wasmer_types::{Bytes, MemoryError, MemoryStyle, MemoryType, Pages}; // The memory mapped area @@ -156,6 +157,18 @@ pub struct VMOwnedMemory { unsafe impl Send for VMOwnedMemory {} unsafe impl Sync for VMOwnedMemory {} +/// A shared linear memory instance. +#[derive(Debug, Clone)] +pub struct VMSharedMemory { + // The underlying allocation. + mmap: Arc<RwLock<WasmMmap>>, + // Configuration of this memory + config: VMMemoryConfig, +} + +unsafe impl Send for VMSharedMemory {} +unsafe impl Sync for VMSharedMemory {} + impl VMOwnedMemory { /// Create a new linear memory instance with specified minimum and maximum number of wasm pages. /// @@ -259,6 +272,16 @@ impl VMOwnedMemory { } } +impl VMOwnedMemory { + /// Converts this owned memory into shared memory + pub fn to_shared(self) -> VMSharedMemory { + VMSharedMemory { + mmap: Arc::new(RwLock::new(self.mmap)), + config: self.config, + } + } +} + impl LinearMemory for VMOwnedMemory { /// Returns the type for this memory. fn ty(&self) -> MemoryType { @@ -295,12 +318,85 @@ impl LinearMemory for VMOwnedMemory { } } +impl VMSharedMemory { + /// Create a new linear memory instance with specified minimum and maximum number of wasm pages. + /// + /// This creates a `Memory` with owned metadata: this can be used to create a memory + /// that will be imported into Wasm modules. + pub fn new(memory: &MemoryType, style: &MemoryStyle) -> Result<Self, MemoryError> { + Ok(VMOwnedMemory::new(memory, style)?.to_shared()) + } + + /// Create a new linear memory instance with specified minimum and maximum number of wasm pages. + /// + /// This creates a `Memory` with metadata owned by a VM, pointed to by + /// `vm_memory_location`: this can be used to create a local memory. + /// + /// # Safety + /// - `vm_memory_location` must point to a valid location in VM memory.
+ pub unsafe fn from_definition( + memory: &MemoryType, + style: &MemoryStyle, + vm_memory_location: NonNull<VMMemoryDefinition>, + ) -> Result<Self, MemoryError> { + Ok(VMOwnedMemory::from_definition(memory, style, vm_memory_location)?.to_shared()) + } +} + +impl LinearMemory for VMSharedMemory { + /// Returns the type for this memory. + fn ty(&self) -> MemoryType { + let minimum = { + let guard = self.mmap.read().unwrap(); + guard.size() + }; + self.config.ty(minimum) + } + + /// Returns the size of the memory in pages + fn size(&self) -> Pages { + let guard = self.mmap.read().unwrap(); + guard.size() + } + + /// Returns the memory style for this memory. + fn style(&self) -> MemoryStyle { + self.config.style() + } + + /// Grow memory by the specified amount of wasm pages. + /// + /// Returns `None` if memory can't be grown by the specified amount + /// of wasm pages. + fn grow(&mut self, delta: Pages) -> Result<Pages, MemoryError> { + let mut guard = self.mmap.write().unwrap(); + guard.grow(delta, self.config.clone()) + } + + /// Return a `VMMemoryDefinition` for exposing the memory to compiled wasm code. + fn vmmemory(&self) -> NonNull<VMMemoryDefinition> { + let guard = self.mmap.read().unwrap(); + guard.vm_memory_definition.as_ptr() + } + + /// Shared memory can not be cloned via `try_clone` (this will always return None) + fn try_clone(&self) -> Option<Box<dyn LinearMemory + Send + Sync>> { + None + } +} + impl From<VMOwnedMemory> for VMMemory { fn from(mem: VMOwnedMemory) -> Self { Self(Box::new(mem)) } } +impl From<VMSharedMemory> for VMMemory { + fn from(mem: VMSharedMemory) -> Self { + Self(Box::new(mem)) + } +} + /// Represents linear memory that can be either owned or shared #[derive(Debug)] pub struct VMMemory(pub Box<dyn LinearMemory + Send + Sync>); @@ -357,8 +453,12 @@ impl VMMemory { /// /// This creates a `Memory` with owned metadata: this can be used to create a memory /// that will be imported into Wasm modules.
- pub fn new(memory: &MemoryType, style: &MemoryStyle) -> Result<Self, MemoryError> { - Ok(Self(Box::new(VMOwnedMemory::new(memory, style)?))) + pub fn new(memory: &MemoryType, style: &MemoryStyle) -> Result<Self, MemoryError> { + Ok(if memory.shared { + Self(Box::new(VMSharedMemory::new(memory, style)?)) + } else { + Self(Box::new(VMOwnedMemory::new(memory, style)?)) + }) } /// Returns the number of pages in the allocated memory block @@ -377,12 +477,20 @@ impl VMMemory { memory: &MemoryType, style: &MemoryStyle, vm_memory_location: NonNull<VMMemoryDefinition>, - ) -> Result<Self, MemoryError> { - Ok(Self(Box::new(VMOwnedMemory::from_definition( - memory, - style, - vm_memory_location, - )?))) + ) -> Result<Self, MemoryError> { + Ok(if memory.shared { + Self(Box::new(VMSharedMemory::from_definition( + memory, + style, + vm_memory_location, + )?)) + } else { + Self(Box::new(VMOwnedMemory::from_definition( + memory, + style, + vm_memory_location, + )?)) + }) } /// Creates VMMemory from a custom implementation - the following into implementations diff --git a/lib/vm/src/store.rs b/lib/vm/src/store.rs index ea11d0582a9..69ed19856c7 100644 --- a/lib/vm/src/store.rs +++ b/lib/vm/src/store.rs @@ -78,6 +78,11 @@ impl StoreObjects { self.id } + /// Sets the ID of this store + pub fn set_id(&mut self, id: StoreId) { + self.id = id; + } + /// Returns a pair of mutable references from two handles. /// /// Panics if both handles point to the same object.
From 34cba1c59fdd144894e45d0ff54e3adbdb32c480 Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Mon, 29 Aug 2022 18:20:00 +0200 Subject: [PATCH 02/24] Added spec threads tests, but disabling all for now Fixed linter --- build.rs | 5 +++++ tests/ignores.txt | 14 ++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/build.rs b/build.rs index 50e0074a31b..55abbf6d6f9 100644 --- a/build.rs +++ b/build.rs @@ -42,6 +42,11 @@ fn main() -> anyhow::Result<()> { wast_processor, )?; test_directory_module(spectests, "tests/wast/spec/proposals/simd", wast_processor)?; + test_directory_module( + spectests, + "tests/wast/spec/proposals/threads", + wast_processor, + )?; // test_directory_module(spectests, "tests/wast/spec/proposals/bulk-memory-operations", wast_processor)?; Ok(()) })?; diff --git a/tests/ignores.txt b/tests/ignores.txt index 39978d01cd9..5fac104550d 100644 --- a/tests/ignores.txt +++ b/tests/ignores.txt @@ -23,6 +23,20 @@ singlepass+aarch64+macos traps::start_trap_pretty llvm traps::start_trap_pretty cranelift+aarch64+macos traps::start_trap_pretty +# Atomics (WIP) +singlepass spec::threads::atomic +singlepass spec::threads::exports +singlepass spec::threads::imports +singlepass spec::threads::memory +cranelift spec::threads::atomic +cranelift spec::threads::exports +cranelift spec::threads::imports +cranelift spec::threads::memory +llvm spec::threads::atomic +llvm spec::threads::exports +llvm spec::threads::imports +llvm spec::threads::memory + # Also neither LLVM nor Cranelift currently implement stack probing on AArch64. 
# https://github.com/wasmerio/wasmer/issues/2808 cranelift+aarch64 spec::skip_stack_guard_page From 013d63430dbc8c95a63d43ccfcb75839c89d0384 Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Tue, 30 Aug 2022 13:01:10 +0200 Subject: [PATCH 03/24] Enabled threads and already working tests --- lib/types/src/features.rs | 4 ++-- tests/ignores.txt | 6 ------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/lib/types/src/features.rs b/lib/types/src/features.rs index 34eace4658b..395c55d837b 100644 --- a/lib/types/src/features.rs +++ b/lib/types/src/features.rs @@ -41,7 +41,7 @@ impl Features { /// Create a new feature pub fn new() -> Self { Self { - threads: false, + threads: true, // Reference types should be on by default reference_types: true, // SIMD should be on by default @@ -249,7 +249,7 @@ mod test_features { assert_eq!( default, Features { - threads: false, + threads: true, reference_types: true, simd: true, bulk_memory: true, diff --git a/tests/ignores.txt b/tests/ignores.txt index 5fac104550d..d3b4332ca0f 100644 --- a/tests/ignores.txt +++ b/tests/ignores.txt @@ -25,17 +25,11 @@ cranelift+aarch64+macos traps::start_trap_pretty # Atomics (WIP) singlepass spec::threads::atomic -singlepass spec::threads::exports singlepass spec::threads::imports -singlepass spec::threads::memory cranelift spec::threads::atomic -cranelift spec::threads::exports cranelift spec::threads::imports -cranelift spec::threads::memory llvm spec::threads::atomic -llvm spec::threads::exports llvm spec::threads::imports -llvm spec::threads::memory # Also neither LLVM nor Cranelift currently implement stack probing on AArch64. 
# https://github.com/wasmerio/wasmer/issues/2808 From 2b7bf2fbe4b40e8f2a123dddc521491880aee1fc Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Wed, 31 Aug 2022 13:27:22 +0200 Subject: [PATCH 04/24] Added some missing emitter on x86_64 singlepass (for #3161) Checked align_check for x86_64 singlepass compiler (for #3161)" Added proper handling of Unaligned Atomics in Singlepass Compiler (for #3161) More fixes to x86_64 singlepass compiler (for #3161) --- lib/compiler-singlepass/src/codegen.rs | 699 ++++++++++++++++--- lib/compiler-singlepass/src/emitter_x64.rs | 14 + lib/compiler-singlepass/src/machine.rs | 86 +++ lib/compiler-singlepass/src/machine_arm64.rs | 113 ++- lib/compiler-singlepass/src/machine_x64.rs | 185 ++++- 5 files changed, 1004 insertions(+), 93 deletions(-) diff --git a/lib/compiler-singlepass/src/codegen.rs b/lib/compiler-singlepass/src/codegen.rs index f045982e898..9e39b9964ba 100644 --- a/lib/compiler-singlepass/src/codegen.rs +++ b/lib/compiler-singlepass/src/codegen.rs @@ -94,6 +94,7 @@ struct SpecialLabelSet { table_access_oob: Label, indirect_call_null: Label, bad_signature: Label, + unaligned_atomic: Label, } /// Metadata about a floating-point value. @@ -1012,7 +1013,9 @@ impl<'a, M: Machine> FuncGen<'a, M> { } /// Emits a memory operation. 
- fn op_memory Result<(), CompileError>>( + fn op_memory< + F: FnOnce(&mut Self, bool, bool, i32, Label, Label) -> Result<(), CompileError>, + >( &mut self, cb: F, ) -> Result<(), CompileError> { @@ -1034,6 +1037,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { self.module.num_imported_memories != 0, offset as i32, self.special_labels.heap_access_oob, + self.special_labels.unaligned_atomic, ) } @@ -1134,6 +1138,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { table_access_oob: machine.get_label(), indirect_call_null: machine.get_label(), bad_signature: machine.get_label(), + unaligned_atomic: machine.get_label(), }; let fsm = FunctionStateMap::new( @@ -3370,7 +3375,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_load( target, memarg, @@ -3379,6 +3389,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -3393,7 +3404,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { self.fp_stack .push(FloatValue::new(self.value_stack.len() - 1)); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.f32_load( target, memarg, @@ -3402,6 +3418,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -3414,7 +3431,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_load_8u( target, memarg, @@ -3423,6 +3445,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + 
unaligned_atomic, ) }, )?; @@ -3435,7 +3458,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_load_8s( target, memarg, @@ -3444,6 +3472,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -3456,7 +3485,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_load_16u( target, memarg, @@ -3465,6 +3499,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -3477,7 +3512,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_load_16s( target, memarg, @@ -3486,6 +3526,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -3494,7 +3535,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { let target_value = self.pop_value_released()?; let target_addr = self.pop_value_released()?; self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_save( target_value, memarg, @@ -3503,6 +3549,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -3513,7 +3560,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { let fp = self.fp_stack.pop1()?; let 
config_nan_canonicalization = self.config.enable_nan_canonicalization; self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.f32_save( target_value, memarg, @@ -3523,6 +3575,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -3531,7 +3584,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { let target_value = self.pop_value_released()?; let target_addr = self.pop_value_released()?; self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_save_8( target_value, memarg, @@ -3540,6 +3598,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -3548,7 +3607,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { let target_value = self.pop_value_released()?; let target_addr = self.pop_value_released()?; self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_save_16( target_value, memarg, @@ -3557,6 +3621,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -3569,7 +3634,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_load( target, memarg, @@ -3578,6 +3648,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -3592,7 +3663,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { self.fp_stack 
.push(FloatValue::new(self.value_stack.len() - 1)); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.f64_load( target, memarg, @@ -3601,6 +3677,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -3613,7 +3690,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_load_8u( target, memarg, @@ -3622,6 +3704,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -3634,7 +3717,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_load_8s( target, memarg, @@ -3643,6 +3731,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -3655,7 +3744,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_load_16u( target, memarg, @@ -3664,6 +3758,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -3676,7 +3771,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + 
heap_access_oob, + unaligned_atomic| { this.machine.i64_load_16s( target, memarg, @@ -3685,6 +3785,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -3697,7 +3798,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_load_32u( target, memarg, @@ -3706,6 +3812,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -3718,7 +3825,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_load_32s( target, memarg, @@ -3727,6 +3839,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -3736,7 +3849,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { let target_addr = self.pop_value_released()?; self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_save( target_value, memarg, @@ -3745,6 +3863,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -3755,7 +3874,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { let fp = self.fp_stack.pop1()?; let config_nan_canonicalization = self.config.enable_nan_canonicalization; self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.f64_save( target_value, memarg, @@ -3765,6 +3889,7 @@ impl<'a, 
M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -3773,7 +3898,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { let target_value = self.pop_value_released()?; let target_addr = self.pop_value_released()?; self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_save_8( target_value, memarg, @@ -3782,6 +3912,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -3790,7 +3921,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { let target_value = self.pop_value_released()?; let target_addr = self.pop_value_released()?; self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_save_16( target_value, memarg, @@ -3799,6 +3935,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -3807,7 +3944,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { let target_value = self.pop_value_released()?; let target_addr = self.pop_value_released()?; self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_save_32( target_value, memarg, @@ -3816,6 +3958,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4112,7 +4255,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_load( target, memarg, @@ -4121,6 +4269,7 @@ impl<'a, M: 
Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4133,7 +4282,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_load_8u( target, memarg, @@ -4142,6 +4296,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4154,7 +4309,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_load_16u( target, memarg, @@ -4163,6 +4323,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4171,7 +4332,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { let target_value = self.pop_value_released()?; let target_addr = self.pop_value_released()?; self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_save( target_value, memarg, @@ -4180,6 +4346,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4188,7 +4355,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { let target_value = self.pop_value_released()?; let target_addr = self.pop_value_released()?; self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_save_8( target_value, memarg, @@ -4197,6 +4369,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, 
offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4205,7 +4378,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { let target_value = self.pop_value_released()?; let target_addr = self.pop_value_released()?; self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_save_16( target_value, memarg, @@ -4214,6 +4392,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4226,7 +4405,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_load( target, memarg, @@ -4235,6 +4419,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4247,7 +4432,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_load_8u( target, memarg, @@ -4256,6 +4446,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4268,7 +4459,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_load_16u( target, memarg, @@ -4277,6 +4473,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4289,7 +4486,12 @@ impl<'a, M: Machine> FuncGen<'a, M> 
{ )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_load_32u( target, memarg, @@ -4298,6 +4500,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4306,7 +4509,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { let target_value = self.pop_value_released()?; let target_addr = self.pop_value_released()?; self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_save( target_value, memarg, @@ -4315,6 +4523,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4323,7 +4532,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { let target_value = self.pop_value_released()?; let target_addr = self.pop_value_released()?; self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_save_8( target_value, memarg, @@ -4332,6 +4546,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4340,7 +4555,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { let target_value = self.pop_value_released()?; let target_addr = self.pop_value_released()?; self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_save_16( target_value, memarg, @@ -4349,6 +4569,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4357,7 +4578,12 @@ impl<'a, M: Machine> 
FuncGen<'a, M> { let target_value = self.pop_value_released()?; let target_addr = self.pop_value_released()?; self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_save_32( target_value, memarg, @@ -4366,6 +4592,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4379,7 +4606,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_add( loc, target, @@ -4389,6 +4621,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4402,7 +4635,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_add( loc, target, @@ -4412,6 +4650,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4425,7 +4664,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_add_8u( loc, target, @@ -4435,6 +4679,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4448,7 +4693,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, 
heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_add_16u( loc, target, @@ -4458,6 +4708,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4471,7 +4722,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_add_8u( loc, target, @@ -4481,6 +4737,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4494,7 +4751,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_add_16u( loc, target, @@ -4504,6 +4766,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4517,7 +4780,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_add_32u( loc, target, @@ -4527,6 +4795,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4540,7 +4809,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_sub( loc, target, @@ -4550,6 +4824,7 @@ impl<'a, 
M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4563,7 +4838,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_sub( loc, target, @@ -4573,6 +4853,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4586,7 +4867,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_sub_8u( loc, target, @@ -4596,6 +4882,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4609,7 +4896,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_sub_16u( loc, target, @@ -4619,6 +4911,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4632,7 +4925,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_sub_8u( loc, target, @@ -4642,6 +4940,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4655,7 +4954,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; 
self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_sub_16u( loc, target, @@ -4665,6 +4969,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4678,7 +4983,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_sub_32u( loc, target, @@ -4688,6 +4998,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4701,7 +5012,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_and( loc, target, @@ -4711,6 +5027,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4724,7 +5041,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_and( loc, target, @@ -4734,6 +5056,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4747,7 +5070,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + 
unaligned_atomic| { this.machine.i32_atomic_and_8u( loc, target, @@ -4757,6 +5085,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4770,7 +5099,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_and_16u( loc, target, @@ -4780,6 +5114,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4793,7 +5128,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_and_8u( loc, target, @@ -4803,6 +5143,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4816,7 +5157,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_and_16u( loc, target, @@ -4826,6 +5172,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4839,7 +5186,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_and_32u( loc, target, @@ -4849,6 +5201,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, 
) }, )?; @@ -4862,7 +5215,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_or( loc, target, @@ -4872,6 +5230,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4885,7 +5244,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_or( loc, target, @@ -4895,6 +5259,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4908,7 +5273,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_or_8u( loc, target, @@ -4918,6 +5288,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4931,7 +5302,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_or_16u( loc, target, @@ -4941,6 +5317,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4954,7 +5331,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + 
need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_or_8u( loc, target, @@ -4964,6 +5346,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -4977,7 +5360,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_or_16u( loc, target, @@ -4987,6 +5375,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5000,7 +5389,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_or_32u( loc, target, @@ -5010,6 +5404,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5023,7 +5418,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_xor( loc, target, @@ -5033,6 +5433,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5046,7 +5447,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_xor( loc, target, @@ -5056,6 +5462,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { 
imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5069,7 +5476,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_xor_8u( loc, target, @@ -5079,6 +5491,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5092,7 +5505,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_xor_16u( loc, target, @@ -5102,6 +5520,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5115,7 +5534,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_xor_8u( loc, target, @@ -5125,6 +5549,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5138,7 +5563,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_xor_16u( loc, target, @@ -5148,6 +5578,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5161,7 +5592,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - 
|this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_xor_32u( loc, target, @@ -5171,6 +5607,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5184,7 +5621,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_xchg( loc, target, @@ -5194,6 +5636,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5207,7 +5650,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_xchg( loc, target, @@ -5217,6 +5665,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5230,7 +5679,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_xchg_8u( loc, target, @@ -5240,6 +5694,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5253,7 +5708,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_xchg_16u( 
loc, target, @@ -5263,6 +5723,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5276,7 +5737,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_xchg_8u( loc, target, @@ -5286,6 +5752,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5299,7 +5766,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_xchg_16u( loc, target, @@ -5309,6 +5781,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5322,7 +5795,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_xchg_32u( loc, target, @@ -5332,6 +5810,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5346,7 +5825,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_cmpxchg( new, cmp, @@ -5357,6 +5841,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5371,7 +5856,12 @@ impl<'a, M: 
Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_cmpxchg( new, cmp, @@ -5382,6 +5872,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5396,7 +5887,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_cmpxchg_8u( new, cmp, @@ -5407,6 +5903,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5421,7 +5918,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i32_atomic_cmpxchg_16u( new, cmp, @@ -5432,6 +5934,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5446,7 +5949,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_cmpxchg_8u( new, cmp, @@ -5457,6 +5965,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5471,7 +5980,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + 
offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_cmpxchg_16u( new, cmp, @@ -5482,6 +5996,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5496,7 +6011,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?[0]; self.value_stack.push(ret); self.op_memory( - |this, need_check, imported_memories, offset, heap_access_oob| { + |this, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic| { this.machine.i64_atomic_cmpxchg_32u( new, cmp, @@ -5507,6 +6027,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { imported_memories, offset, heap_access_oob, + unaligned_atomic, ) }, )?; @@ -5938,6 +6459,10 @@ impl<'a, M: Machine> FuncGen<'a, M> { self.machine.emit_label(self.special_labels.bad_signature)?; self.machine.emit_illegal_op(TrapCode::BadSignature)?; + self.machine + .emit_label(self.special_labels.unaligned_atomic)?; + self.machine.emit_illegal_op(TrapCode::UnalignedAtomic)?; + // Notify the assembler backend to generate necessary code at end of function. 
self.machine.finalize_function()?; diff --git a/lib/compiler-singlepass/src/emitter_x64.rs b/lib/compiler-singlepass/src/emitter_x64.rs index 0b45f562b8b..736f8fb48ee 100644 --- a/lib/compiler-singlepass/src/emitter_x64.rs +++ b/lib/compiler-singlepass/src/emitter_x64.rs @@ -1404,6 +1404,9 @@ impl EmitterX64 for AssemblerX64 { (Size::S16, Location::Memory(src, disp), Size::S32, Location::GPR(dst)) => { dynasm!(self ; movzx Rd(dst as u8), WORD [Rq(src as u8) + disp]); } + (Size::S16, Location::Imm32(imm), Size::S32, Location::GPR(dst)) => { + dynasm!(self ; mov Rd(dst as u8), imm as i32); + } (Size::S8, Location::GPR(src), Size::S64, Location::GPR(dst)) => { dynasm!(self ; movzx Rq(dst as u8), Rb(src as u8)); } @@ -1416,6 +1419,17 @@ impl EmitterX64 for AssemblerX64 { (Size::S16, Location::Memory(src, disp), Size::S64, Location::GPR(dst)) => { dynasm!(self ; movzx Rq(dst as u8), WORD [Rq(src as u8) + disp]); } + (Size::S32, Location::GPR(src), Size::S64, Location::GPR(dst)) => { + if src != dst { + dynasm!(self ; mov Rd(dst as u8), Rd(src as u8)); + } + } + (Size::S32, Location::Memory(src, disp), Size::S64, Location::GPR(dst)) => { + dynasm!(self ; mov Rd(dst as u8), DWORD [Rq(src as u8) + disp]); + } + (Size::S32, Location::Imm64(imm), Size::S64, Location::GPR(dst)) => { + dynasm!(self ; mov Rq(dst as u8), imm as i32); + } _ => { codegen_error!( "singlepass can't emit MOVZX {:?} {:?} {:?} {:?}", diff --git a/lib/compiler-singlepass/src/machine.rs b/lib/compiler-singlepass/src/machine.rs index cf89e5232ef..7f8239fbc4b 100644 --- a/lib/compiler-singlepass/src/machine.rs +++ b/lib/compiler-singlepass/src/machine.rs @@ -669,6 +669,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 load of an unsigned 8bits #[allow(clippy::too_many_arguments)] @@ -681,6 +682,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: 
Label, ) -> Result<(), CompileError>; /// i32 load of an signed 8bits #[allow(clippy::too_many_arguments)] @@ -693,6 +695,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 load of an unsigned 16bits #[allow(clippy::too_many_arguments)] @@ -705,6 +708,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 load of an signed 16bits #[allow(clippy::too_many_arguments)] @@ -717,6 +721,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic load #[allow(clippy::too_many_arguments)] @@ -729,6 +734,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic load of an unsigned 8bits #[allow(clippy::too_many_arguments)] @@ -741,6 +747,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic load of an unsigned 16bits #[allow(clippy::too_many_arguments)] @@ -753,6 +760,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 save #[allow(clippy::too_many_arguments)] @@ -765,6 +773,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 save of the lower 8bits #[allow(clippy::too_many_arguments)] @@ -777,6 +786,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 save of the lower 16bits #[allow(clippy::too_many_arguments)] @@ -789,6 +799,7 @@ pub trait Machine { imported_memories: bool, 
offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic save #[allow(clippy::too_many_arguments)] @@ -801,6 +812,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic save of a the lower 8bits #[allow(clippy::too_many_arguments)] @@ -813,6 +825,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic save of a the lower 16bits #[allow(clippy::too_many_arguments)] @@ -825,6 +838,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic Add with i32 #[allow(clippy::too_many_arguments)] @@ -838,6 +852,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic Add with unsigned 8bits #[allow(clippy::too_many_arguments)] @@ -851,6 +866,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic Add with unsigned 16bits #[allow(clippy::too_many_arguments)] @@ -864,6 +880,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic Sub with i32 #[allow(clippy::too_many_arguments)] @@ -877,6 +894,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic Sub with unsigned 8bits #[allow(clippy::too_many_arguments)] @@ -890,6 +908,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic Sub with unsigned 16bits 
#[allow(clippy::too_many_arguments)] @@ -903,6 +922,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic And with i32 #[allow(clippy::too_many_arguments)] @@ -916,6 +936,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic And with unsigned 8bits #[allow(clippy::too_many_arguments)] @@ -929,6 +950,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic And with unsigned 16bits #[allow(clippy::too_many_arguments)] @@ -942,6 +964,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic Or with i32 #[allow(clippy::too_many_arguments)] @@ -955,6 +978,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic Or with unsigned 8bits #[allow(clippy::too_many_arguments)] @@ -968,6 +992,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic Or with unsigned 16bits #[allow(clippy::too_many_arguments)] @@ -981,6 +1006,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic Xor with i32 #[allow(clippy::too_many_arguments)] @@ -994,6 +1020,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic Xor with unsigned 8bits #[allow(clippy::too_many_arguments)] @@ -1007,6 +1034,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + 
unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic Xor with unsigned 16bits #[allow(clippy::too_many_arguments)] @@ -1020,6 +1048,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic Exchange with i32 #[allow(clippy::too_many_arguments)] @@ -1033,6 +1062,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic Exchange with u8 #[allow(clippy::too_many_arguments)] @@ -1046,6 +1076,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic Exchange with u16 #[allow(clippy::too_many_arguments)] @@ -1059,6 +1090,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic Compare and Exchange with i32 #[allow(clippy::too_many_arguments)] @@ -1073,6 +1105,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic Compare and Exchange with u8 #[allow(clippy::too_many_arguments)] @@ -1087,6 +1120,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i32 atomic Compare and Exchange with u16 #[allow(clippy::too_many_arguments)] @@ -1101,6 +1135,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// emit a move function address to GPR ready for call, using appropriate relocation @@ -1321,6 +1356,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 load of an unsigned 
8bits #[allow(clippy::too_many_arguments)] @@ -1333,6 +1369,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 load of an signed 8bits #[allow(clippy::too_many_arguments)] @@ -1345,6 +1382,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 load of an unsigned 32bits #[allow(clippy::too_many_arguments)] @@ -1357,6 +1395,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 load of an signed 32bits #[allow(clippy::too_many_arguments)] @@ -1369,6 +1408,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 load of an signed 16bits #[allow(clippy::too_many_arguments)] @@ -1381,6 +1421,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 load of an signed 16bits #[allow(clippy::too_many_arguments)] @@ -1393,6 +1434,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic load #[allow(clippy::too_many_arguments)] @@ -1405,6 +1447,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic load from unsigned 8bits #[allow(clippy::too_many_arguments)] @@ -1417,6 +1460,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic load from unsigned 16bits #[allow(clippy::too_many_arguments)] @@ -1429,6 +1473,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: 
Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic load from unsigned 32bits #[allow(clippy::too_many_arguments)] @@ -1441,6 +1486,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 save #[allow(clippy::too_many_arguments)] @@ -1453,6 +1499,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 save of the lower 8bits #[allow(clippy::too_many_arguments)] @@ -1465,6 +1512,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 save of the lower 16bits #[allow(clippy::too_many_arguments)] @@ -1477,6 +1525,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 save of the lower 32bits #[allow(clippy::too_many_arguments)] @@ -1489,6 +1538,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic save #[allow(clippy::too_many_arguments)] @@ -1501,6 +1551,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic save of a the lower 8bits #[allow(clippy::too_many_arguments)] @@ -1513,6 +1564,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic save of a the lower 16bits #[allow(clippy::too_many_arguments)] @@ -1525,6 +1577,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic save of a the lower 32bits #[allow(clippy::too_many_arguments)] @@ 
-1537,6 +1590,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Add with i64 #[allow(clippy::too_many_arguments)] @@ -1550,6 +1604,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Add with unsigned 8bits #[allow(clippy::too_many_arguments)] @@ -1563,6 +1618,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Add with unsigned 16bits #[allow(clippy::too_many_arguments)] @@ -1576,6 +1632,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Add with unsigned 32bits #[allow(clippy::too_many_arguments)] @@ -1589,6 +1646,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Sub with i64 #[allow(clippy::too_many_arguments)] @@ -1602,6 +1660,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Sub with unsigned 8bits #[allow(clippy::too_many_arguments)] @@ -1615,6 +1674,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Sub with unsigned 16bits #[allow(clippy::too_many_arguments)] @@ -1628,6 +1688,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Sub with unsigned 32bits #[allow(clippy::too_many_arguments)] @@ -1641,6 +1702,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + 
unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic And with i64 #[allow(clippy::too_many_arguments)] @@ -1654,6 +1716,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic And with unsigned 8bits #[allow(clippy::too_many_arguments)] @@ -1667,6 +1730,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic And with unsigned 16bits #[allow(clippy::too_many_arguments)] @@ -1680,6 +1744,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic And with unsigned 32bits #[allow(clippy::too_many_arguments)] @@ -1693,6 +1758,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Or with i64 #[allow(clippy::too_many_arguments)] @@ -1706,6 +1772,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Or with unsigned 8bits #[allow(clippy::too_many_arguments)] @@ -1719,6 +1786,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Or with unsigned 16bits #[allow(clippy::too_many_arguments)] @@ -1732,6 +1800,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Or with unsigned 32bits #[allow(clippy::too_many_arguments)] @@ -1745,6 +1814,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Xor with i64 
#[allow(clippy::too_many_arguments)] @@ -1758,6 +1828,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Xor with unsigned 8bits #[allow(clippy::too_many_arguments)] @@ -1771,6 +1842,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Xor with unsigned 16bits #[allow(clippy::too_many_arguments)] @@ -1784,6 +1856,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Xor with unsigned 32bits #[allow(clippy::too_many_arguments)] @@ -1797,6 +1870,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Exchange with i64 #[allow(clippy::too_many_arguments)] @@ -1810,6 +1884,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Exchange with u8 #[allow(clippy::too_many_arguments)] @@ -1823,6 +1898,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Exchange with u16 #[allow(clippy::too_many_arguments)] @@ -1836,6 +1912,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Exchange with u32 #[allow(clippy::too_many_arguments)] @@ -1849,6 +1926,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Compare and Exchange with i32 #[allow(clippy::too_many_arguments)] @@ -1863,6 +1941,7 @@ pub trait Machine { imported_memories: bool, offset: 
i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Compare and Exchange with u8 #[allow(clippy::too_many_arguments)] @@ -1877,6 +1956,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Compare and Exchange with u16 #[allow(clippy::too_many_arguments)] @@ -1891,6 +1971,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// i64 atomic Compare and Exchange with u32 #[allow(clippy::too_many_arguments)] @@ -1905,6 +1986,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// load an F32 @@ -1918,6 +2000,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// f32 save #[allow(clippy::too_many_arguments)] @@ -1931,6 +2014,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// load an F64 #[allow(clippy::too_many_arguments)] @@ -1943,6 +2027,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// f64 save #[allow(clippy::too_many_arguments)] @@ -1956,6 +2041,7 @@ pub trait Machine { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError>; /// Convert a F64 from I64, signed or unsigned fn convert_f64_i64( diff --git a/lib/compiler-singlepass/src/machine_arm64.rs b/lib/compiler-singlepass/src/machine_arm64.rs index bab8ffa8fe8..806bb33eb1e 100644 --- a/lib/compiler-singlepass/src/machine_arm64.rs +++ b/lib/compiler-singlepass/src/machine_arm64.rs @@ -967,6 +967,7 @@ impl MachineARM64 { 
imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, cb: F, ) -> Result<(), CompileError> { let tmp_addr = self.acquire_temp_gpr().ok_or_else(|| { @@ -1103,7 +1104,7 @@ impl MachineARM64 { Location::GPR(tmp_addr), )?; self.assembler - .emit_bcond_label_far(Condition::Ne, heap_access_oob)?; + .emit_bcond_label_far(Condition::Ne, unaligned_atomic)?; } let begin = self.assembler.get_offset().0; cb(self, tmp_addr)?; @@ -1127,6 +1128,7 @@ impl MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, _cb: F, ) { unimplemented!(); @@ -3175,6 +3177,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -3185,6 +3188,7 @@ impl Machine for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_ldr32(Size::S32, ret, Location::Memory(addr, 0)), ) } @@ -3197,6 +3201,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -3207,6 +3212,7 @@ impl Machine for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_ldr8(Size::S32, ret, Location::Memory(addr, 0)), ) } @@ -3219,6 +3225,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -3229,6 +3236,7 @@ impl Machine for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_ldr8s(Size::S32, ret, Location::Memory(addr, 0)), ) } @@ -3241,6 +3249,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { 
self.memory_op( addr, @@ -3251,6 +3260,7 @@ impl Machine for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_ldr16(Size::S32, ret, Location::Memory(addr, 0)), ) } @@ -3263,6 +3273,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -3273,6 +3284,7 @@ impl Machine for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_ldr16s(Size::S32, ret, Location::Memory(addr, 0)), ) } @@ -3285,6 +3297,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_load unimplemented"); } @@ -3297,6 +3310,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_load_8u unimplemented"); } @@ -3309,6 +3323,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_load_16u unimplemented"); } @@ -3321,6 +3336,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( target_addr, @@ -3331,6 +3347,7 @@ impl Machine for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_str32(target_value, Location::Memory(addr, 0)), ) } @@ -3343,6 +3360,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( target_addr, @@ -3353,6 +3371,7 @@ impl Machine 
for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_str8(target_value, Location::Memory(addr, 0)), ) } @@ -3365,6 +3384,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( target_addr, @@ -3375,6 +3395,7 @@ impl Machine for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_str16(target_value, Location::Memory(addr, 0)), ) } @@ -3387,6 +3408,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_save unimplemented"); } @@ -3399,6 +3421,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_save_8 unimplemented"); } @@ -3411,6 +3434,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_save_16 unimplemented"); } @@ -3425,6 +3449,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_add unimplemented"); } @@ -3439,6 +3464,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_add_8u unimplemented"); } @@ -3453,6 +3479,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass 
i32_atomic_add_16u unimplemented"); } @@ -3467,6 +3494,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_sub unimplemented"); } @@ -3481,6 +3509,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_sub_8u unimplemented"); } @@ -3495,6 +3524,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_sub_16u unimplemented"); } @@ -3509,6 +3539,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_and unimplemented"); } @@ -3523,6 +3554,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_and_8u unimplemented"); } @@ -3537,6 +3569,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_and_16u unimplemented"); } @@ -3551,6 +3584,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_or unimplemented"); } @@ -3565,6 +3599,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_or_8u unimplemented"); } @@ -3579,6 +3614,7 @@ impl 
Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_or_16u unimplemented"); } @@ -3593,6 +3629,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_xor unimplemented"); } @@ -3607,6 +3644,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_xor_8u unimplemented"); } @@ -3621,6 +3659,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_xor_16u unimplemented"); } @@ -3635,6 +3674,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_xchg unimplemented"); } @@ -3649,6 +3689,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_xchg_8u unimplemented"); } @@ -3663,6 +3704,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_xchg_16u unimplemented"); } @@ -3678,6 +3720,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_cmpxchg unimplemented"); } @@ -3693,6 +3736,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, 
_offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_cmpxchg_8u unimplemented"); } @@ -3708,6 +3752,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i32_atomic_cmpxchg_16u unimplemented"); } @@ -4214,6 +4259,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -4224,6 +4270,7 @@ impl Machine for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_ldr64(Size::S64, ret, Location::Memory(addr, 0)), ) } @@ -4236,6 +4283,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -4246,6 +4294,7 @@ impl Machine for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_ldr8(Size::S64, ret, Location::Memory(addr, 0)), ) } @@ -4258,6 +4307,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -4268,6 +4318,7 @@ impl Machine for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_ldr8s(Size::S64, ret, Location::Memory(addr, 0)), ) } @@ -4280,6 +4331,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -4290,6 +4342,7 @@ impl Machine for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_ldr16(Size::S64, ret, 
Location::Memory(addr, 0)), ) } @@ -4302,6 +4355,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -4312,6 +4366,7 @@ impl Machine for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_ldr16s(Size::S64, ret, Location::Memory(addr, 0)), ) } @@ -4324,6 +4379,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -4334,6 +4390,7 @@ impl Machine for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_ldr32(Size::S64, ret, Location::Memory(addr, 0)), ) } @@ -4346,6 +4403,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -4356,6 +4414,7 @@ impl Machine for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_ldr32s(Size::S64, ret, Location::Memory(addr, 0)), ) } @@ -4368,6 +4427,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_load unimplemented"); } @@ -4380,6 +4440,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_load_8u unimplemented"); } @@ -4392,6 +4453,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_load_16u unimplemented"); } @@ -4404,6 +4466,7 @@ impl Machine 
for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_load_32u unimplemented"); } @@ -4416,6 +4479,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( target_addr, @@ -4426,6 +4490,7 @@ impl Machine for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_str64(target_value, Location::Memory(addr, 0)), ) } @@ -4438,6 +4503,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( target_addr, @@ -4448,6 +4514,7 @@ impl Machine for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_str8(target_value, Location::Memory(addr, 0)), ) } @@ -4460,6 +4527,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( target_addr, @@ -4470,6 +4538,7 @@ impl Machine for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_str16(target_value, Location::Memory(addr, 0)), ) } @@ -4482,6 +4551,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( target_addr, @@ -4492,6 +4562,7 @@ impl Machine for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_str32(target_value, Location::Memory(addr, 0)), ) } @@ -4504,6 +4575,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), 
CompileError> { codegen_error!("singlepass i64_atomic_save unimplemented"); } @@ -4516,6 +4588,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_save_8 unimplemented"); } @@ -4528,6 +4601,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_save_16 unimplemented"); } @@ -4540,6 +4614,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_save_32 unimplemented"); } @@ -4554,6 +4629,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_add unimplemented"); } @@ -4568,6 +4644,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_add_8u unimplemented"); } @@ -4582,6 +4659,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_add_16u unimplemented"); } @@ -4596,6 +4674,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_add_32u unimplemented"); } @@ -4610,6 +4689,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass 
i64_atomic_sub unimplemented"); } @@ -4624,6 +4704,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_sub_8u unimplemented"); } @@ -4638,6 +4719,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_sub_16u unimplemented"); } @@ -4652,6 +4734,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_sub_32u unimplemented"); } @@ -4666,6 +4749,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_and unimplemented"); } @@ -4680,6 +4764,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_and_8u unimplemented"); } @@ -4694,6 +4779,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_and_16u unimplemented"); } @@ -4708,6 +4794,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_and_32u unimplemented"); } @@ -4722,6 +4809,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_or unimplemented"); } @@ -4736,6 +4824,7 @@ 
impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_or_8u unimplemented"); } @@ -4750,6 +4839,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_or_16u unimplemented"); } @@ -4764,6 +4854,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_or_32u unimplemented"); } @@ -4778,6 +4869,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_xor unimplemented"); } @@ -4792,6 +4884,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_xor_8u unimplemented"); } @@ -4806,6 +4899,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_xor_16u unimplemented"); } @@ -4820,6 +4914,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_xor_32u unimplemented"); } @@ -4834,6 +4929,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_xchg unimplemented"); } @@ -4848,6 +4944,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, 
_offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_xchg_8u unimplemented"); } @@ -4862,6 +4959,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_xchg_16u unimplemented"); } @@ -4876,6 +4974,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_xchg_32u unimplemented"); } @@ -4891,6 +4990,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_cmpxchg unimplemented"); } @@ -4906,6 +5006,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_cmpxchg_8u unimplemented"); } @@ -4921,6 +5022,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_cmpxchg_16u unimplemented"); } @@ -4936,6 +5038,7 @@ impl Machine for MachineARM64 { _imported_memories: bool, _offset: i32, _heap_access_oob: Label, + _unaligned_atomic: Label, ) -> Result<(), CompileError> { codegen_error!("singlepass i64_atomic_cmpxchg_32u unimplemented"); } @@ -4949,6 +5052,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -4959,6 +5063,7 @@ impl Machine for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| 
this.emit_relaxed_ldr32(Size::S32, ret, Location::Memory(addr, 0)), ) } @@ -4972,6 +5077,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let canonicalize = canonicalize && self.arch_supports_canonicalize_nan(); self.memory_op( @@ -4983,6 +5089,7 @@ impl Machine for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { if !canonicalize { this.emit_relaxed_str32(target_value, Location::Memory(addr, 0)) @@ -5001,6 +5108,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -5011,6 +5119,7 @@ impl Machine for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_ldr64(Size::S64, ret, Location::Memory(addr, 0)), ) } @@ -5024,6 +5133,7 @@ impl Machine for MachineARM64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let canonicalize = canonicalize && self.arch_supports_canonicalize_nan(); self.memory_op( @@ -5035,6 +5145,7 @@ impl Machine for MachineARM64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { if !canonicalize { this.emit_relaxed_str64(target_value, Location::Memory(addr, 0)) diff --git a/lib/compiler-singlepass/src/machine_x64.rs b/lib/compiler-singlepass/src/machine_x64.rs index 63df504707d..511b1ce8878 100644 --- a/lib/compiler-singlepass/src/machine_x64.rs +++ b/lib/compiler-singlepass/src/machine_x64.rs @@ -492,6 +492,7 @@ impl MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, cb: F, ) -> Result<(), CompileError> { // This function as been re-writen to use only 2 temporary register instead of 3 @@ -590,7 +591,7 @@ impl MachineX86_64 { self.release_gpr(tmp2); - let align 
= memarg.align; + let align = value_size as u32; if check_alignment && align != 1 { let tmp_aligncheck = self.acquire_temp_gpr().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) @@ -602,11 +603,11 @@ impl MachineX86_64 { )?; self.assembler.emit_and( Size::S64, - Location::Imm32((align - 1).into()), + Location::Imm32(align - 1), Location::GPR(tmp_aligncheck), )?; self.assembler - .emit_jmp(Condition::NotEqual, heap_access_oob)?; + .emit_jmp(Condition::NotEqual, unaligned_atomic)?; self.release_gpr(tmp_aligncheck); } let begin = self.assembler.get_offset().0; @@ -632,6 +633,7 @@ impl MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, cb: F, ) -> Result<(), CompileError> { if memory_sz > stack_sz { @@ -660,6 +662,7 @@ impl MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.load_address(memory_sz, Location::GPR(compare), Location::Memory(addr, 0))?; this.move_location(stack_sz, Location::GPR(compare), ret)?; @@ -3311,6 +3314,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -3321,6 +3325,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_binop( AssemblerX64::emit_mov, @@ -3340,6 +3345,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -3350,6 +3356,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_zx_sx( AssemblerX64::emit_movzx, @@ -3370,6 +3377,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { 
self.memory_op( addr, @@ -3380,6 +3388,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_zx_sx( AssemblerX64::emit_movsx, @@ -3400,6 +3409,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -3410,6 +3420,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_zx_sx( AssemblerX64::emit_movzx, @@ -3430,6 +3441,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -3440,6 +3452,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_zx_sx( AssemblerX64::emit_movsx, @@ -3460,6 +3473,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -3470,6 +3484,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_mov(Size::S32, Location::Memory(addr, 0), ret), ) } @@ -3482,6 +3497,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -3492,6 +3508,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_zero_extension( Size::S8, @@ -3511,6 +3528,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -3521,6 +3539,7 @@ impl Machine for MachineX86_64 
{ imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_zero_extension( Size::S16, @@ -3540,6 +3559,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( target_addr, @@ -3550,6 +3570,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_binop( AssemblerX64::emit_mov, @@ -3569,6 +3590,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( target_addr, @@ -3579,6 +3601,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_binop( AssemblerX64::emit_mov, @@ -3598,6 +3621,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( target_addr, @@ -3608,6 +3632,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_binop( AssemblerX64::emit_mov, @@ -3630,6 +3655,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( target_addr, @@ -3640,6 +3666,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_binop( AssemblerX64::emit_mov, @@ -3659,6 +3686,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( target_addr, @@ -3669,6 +3697,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + 
unaligned_atomic, |this, addr| { this.emit_relaxed_binop( AssemblerX64::emit_mov, @@ -3688,6 +3717,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( target_addr, @@ -3698,6 +3728,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_binop( AssemblerX64::emit_mov, @@ -3719,6 +3750,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let value = self.acquire_temp_gpr().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) @@ -3733,6 +3765,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler.emit_lock_xadd( Size::S32, @@ -3756,6 +3789,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let value = self.acquire_temp_gpr().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) @@ -3770,6 +3804,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler.emit_lock_xadd( Size::S8, @@ -3793,6 +3828,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let value = self.acquire_temp_gpr().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) @@ -3807,6 +3843,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler.emit_lock_xadd( Size::S16, @@ -3830,6 +3867,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, 
heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let value = self.acquire_temp_gpr().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) @@ -3844,6 +3882,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler.emit_lock_xadd( Size::S32, @@ -3867,6 +3906,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let value = self.acquire_temp_gpr().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) @@ -3881,6 +3921,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler.emit_lock_xadd( Size::S8, @@ -3904,6 +3945,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let value = self.acquire_temp_gpr().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) @@ -3918,6 +3960,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler.emit_lock_xadd( Size::S16, @@ -3941,6 +3984,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.emit_compare_and_swap( loc, @@ -3954,6 +3998,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, src, dst| { this.assembler .emit_and(Size::S32, Location::GPR(src), Location::GPR(dst)) @@ -3971,6 +4016,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.emit_compare_and_swap( loc, @@ -3984,6 +4030,7 @@ impl Machine 
for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, src, dst| { this.assembler .emit_and(Size::S32, Location::GPR(src), Location::GPR(dst)) @@ -4001,6 +4048,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.emit_compare_and_swap( loc, @@ -4014,6 +4062,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, src, dst| { this.assembler .emit_and(Size::S32, Location::GPR(src), Location::GPR(dst)) @@ -4031,6 +4080,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.emit_compare_and_swap( loc, @@ -4044,6 +4094,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, src, dst| { this.assembler .emit_or(Size::S32, Location::GPR(src), Location::GPR(dst)) @@ -4061,6 +4112,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.emit_compare_and_swap( loc, @@ -4074,6 +4126,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, src, dst| { this.assembler .emit_or(Size::S32, Location::GPR(src), Location::GPR(dst)) @@ -4091,6 +4144,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.emit_compare_and_swap( loc, @@ -4104,6 +4158,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, src, dst| { this.assembler .emit_or(Size::S32, Location::GPR(src), Location::GPR(dst)) @@ -4121,6 +4176,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + 
unaligned_atomic: Label, ) -> Result<(), CompileError> { self.emit_compare_and_swap( loc, @@ -4134,6 +4190,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, src, dst| { this.assembler .emit_xor(Size::S32, Location::GPR(src), Location::GPR(dst)) @@ -4151,6 +4208,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.emit_compare_and_swap( loc, @@ -4164,6 +4222,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, src, dst| { this.assembler .emit_xor(Size::S32, Location::GPR(src), Location::GPR(dst)) @@ -4181,6 +4240,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.emit_compare_and_swap( loc, @@ -4194,6 +4254,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, src, dst| { this.assembler .emit_xor(Size::S32, Location::GPR(src), Location::GPR(dst)) @@ -4211,6 +4272,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let value = self.acquire_temp_gpr().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) @@ -4225,6 +4287,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler .emit_xchg(Size::S32, Location::GPR(value), Location::Memory(addr, 0)) @@ -4245,6 +4308,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let value = self.acquire_temp_gpr().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) @@ -4260,6 +4324,7 @@ impl 
Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler .emit_xchg(Size::S8, Location::GPR(value), Location::Memory(addr, 0)) @@ -4280,6 +4345,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let value = self.acquire_temp_gpr().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) @@ -4295,6 +4361,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler .emit_xchg(Size::S16, Location::GPR(value), Location::Memory(addr, 0)) @@ -4316,6 +4383,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let compare = self.reserve_unused_temp_gpr(GPR::RAX); let value = if cmp == Location::GPR(GPR::R14) { @@ -4342,6 +4410,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler.emit_lock_cmpxchg( Size::S32, @@ -4368,6 +4437,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let compare = self.reserve_unused_temp_gpr(GPR::RAX); let value = if cmp == Location::GPR(GPR::R14) { @@ -4394,6 +4464,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler.emit_lock_cmpxchg( Size::S8, @@ -4420,6 +4491,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let compare = self.reserve_unused_temp_gpr(GPR::RAX); let value = if cmp == Location::GPR(GPR::R14) { @@ -4446,6 +4518,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, 
heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler.emit_lock_cmpxchg( Size::S16, @@ -4910,6 +4983,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -4920,6 +4994,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_binop( AssemblerX64::emit_mov, @@ -4939,6 +5014,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -4949,6 +5025,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_zx_sx( AssemblerX64::emit_movzx, @@ -4969,6 +5046,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -4979,6 +5057,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_zx_sx( AssemblerX64::emit_movsx, @@ -4999,6 +5078,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -5009,6 +5089,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_zx_sx( AssemblerX64::emit_movzx, @@ -5029,6 +5110,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -5039,6 +5121,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_zx_sx( 
AssemblerX64::emit_movsx, @@ -5059,6 +5142,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -5069,6 +5153,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { match ret { Location::GPR(_) => {} @@ -5101,6 +5186,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -5111,6 +5197,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_zx_sx( AssemblerX64::emit_movsx, @@ -5131,6 +5218,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -5141,6 +5229,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_mov(Size::S64, Location::Memory(addr, 0), ret), ) } @@ -5153,6 +5242,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -5163,6 +5253,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_zero_extension( Size::S8, @@ -5182,6 +5273,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -5192,6 +5284,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_zero_extension( Size::S16, @@ -5211,6 +5304,7 @@ impl Machine for MachineX86_64 { 
imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -5221,6 +5315,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { match ret { Location::GPR(_) => {} @@ -5253,6 +5348,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( target_addr, @@ -5263,6 +5359,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_binop( AssemblerX64::emit_mov, @@ -5282,6 +5379,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( target_addr, @@ -5292,6 +5390,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_binop( AssemblerX64::emit_mov, @@ -5311,6 +5410,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( target_addr, @@ -5321,6 +5421,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_binop( AssemblerX64::emit_mov, @@ -5340,6 +5441,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( target_addr, @@ -5350,6 +5452,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_binop( AssemblerX64::emit_mov, @@ -5369,6 +5472,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + 
unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( target_addr, @@ -5379,6 +5483,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_atomic_xchg(Size::S64, value, Location::Memory(addr, 0)), ) } @@ -5391,6 +5496,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( target_addr, @@ -5401,6 +5507,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_atomic_xchg(Size::S8, value, Location::Memory(addr, 0)), ) } @@ -5413,6 +5520,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( target_addr, @@ -5423,6 +5531,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_atomic_xchg(Size::S16, value, Location::Memory(addr, 0)), ) } @@ -5435,6 +5544,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( target_addr, @@ -5445,6 +5555,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| this.emit_relaxed_atomic_xchg(Size::S32, value, Location::Memory(addr, 0)), ) } @@ -5459,6 +5570,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let value = self.acquire_temp_gpr().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) @@ -5473,9 +5585,10 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { 
this.assembler.emit_lock_xadd( - Size::S32, + Size::S64, Location::GPR(value), Location::Memory(addr, 0), ) @@ -5496,6 +5609,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let value = self.acquire_temp_gpr().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) @@ -5510,6 +5624,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler.emit_lock_xadd( Size::S8, @@ -5533,6 +5648,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let value = self.acquire_temp_gpr().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) @@ -5547,6 +5663,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler.emit_lock_xadd( Size::S16, @@ -5570,6 +5687,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let value = self.acquire_temp_gpr().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) @@ -5584,6 +5702,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler.emit_lock_xadd( Size::S32, @@ -5607,6 +5726,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let value = self.acquire_temp_gpr().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) @@ -5621,6 +5741,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler.emit_lock_xadd( 
Size::S64, @@ -5644,6 +5765,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let value = self.acquire_temp_gpr().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) @@ -5658,6 +5780,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler.emit_lock_xadd( Size::S8, @@ -5681,6 +5804,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let value = self.acquire_temp_gpr().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) @@ -5695,6 +5819,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler.emit_lock_xadd( Size::S16, @@ -5718,6 +5843,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let value = self.acquire_temp_gpr().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) @@ -5732,6 +5858,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler.emit_lock_xadd( Size::S32, @@ -5755,6 +5882,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.emit_compare_and_swap( loc, @@ -5768,6 +5896,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, src, dst| { this.assembler .emit_and(Size::S64, Location::GPR(src), Location::GPR(dst)) @@ -5785,6 +5914,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: 
Label, ) -> Result<(), CompileError> { self.emit_compare_and_swap( loc, @@ -5798,6 +5928,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, src, dst| { this.assembler .emit_and(Size::S64, Location::GPR(src), Location::GPR(dst)) @@ -5815,6 +5946,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.emit_compare_and_swap( loc, @@ -5828,6 +5960,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, src, dst| { this.assembler .emit_and(Size::S64, Location::GPR(src), Location::GPR(dst)) @@ -5845,6 +5978,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.emit_compare_and_swap( loc, @@ -5858,6 +5992,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, src, dst| { this.assembler .emit_and(Size::S64, Location::GPR(src), Location::GPR(dst)) @@ -5875,6 +6010,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.emit_compare_and_swap( loc, @@ -5888,6 +6024,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, src, dst| { this.location_or(Size::S64, Location::GPR(src), Location::GPR(dst), false) }, @@ -5904,6 +6041,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.emit_compare_and_swap( loc, @@ -5917,6 +6055,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, src, dst| { this.location_or(Size::S64, Location::GPR(src), Location::GPR(dst), false) }, @@ -5933,6 
+6072,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.emit_compare_and_swap( loc, @@ -5946,6 +6086,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, src, dst| { this.location_or(Size::S64, Location::GPR(src), Location::GPR(dst), false) }, @@ -5962,6 +6103,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.emit_compare_and_swap( loc, @@ -5975,6 +6117,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, src, dst| { this.location_or(Size::S64, Location::GPR(src), Location::GPR(dst), false) }, @@ -5991,6 +6134,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.emit_compare_and_swap( loc, @@ -6004,6 +6148,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, src, dst| { this.location_xor(Size::S64, Location::GPR(src), Location::GPR(dst), false) }, @@ -6020,6 +6165,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.emit_compare_and_swap( loc, @@ -6033,6 +6179,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, src, dst| { this.location_xor(Size::S64, Location::GPR(src), Location::GPR(dst), false) }, @@ -6049,6 +6196,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.emit_compare_and_swap( loc, @@ -6062,6 +6210,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, 
+ unaligned_atomic, |this, src, dst| { this.location_xor(Size::S64, Location::GPR(src), Location::GPR(dst), false) }, @@ -6078,6 +6227,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.emit_compare_and_swap( loc, @@ -6091,6 +6241,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, src, dst| { this.location_xor(Size::S64, Location::GPR(src), Location::GPR(dst), false) }, @@ -6107,6 +6258,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let value = self.acquire_temp_gpr().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) @@ -6121,6 +6273,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler .emit_xchg(Size::S64, Location::GPR(value), Location::Memory(addr, 0)) @@ -6141,6 +6294,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let value = self.acquire_temp_gpr().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) @@ -6156,6 +6310,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler .emit_xchg(Size::S8, Location::GPR(value), Location::Memory(addr, 0)) @@ -6176,6 +6331,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let value = self.acquire_temp_gpr().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) @@ -6191,6 +6347,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, 
|this, addr| { this.assembler .emit_xchg(Size::S16, Location::GPR(value), Location::Memory(addr, 0)) @@ -6211,6 +6368,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let value = self.acquire_temp_gpr().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) @@ -6226,6 +6384,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler .emit_xchg(Size::S32, Location::GPR(value), Location::Memory(addr, 0)) @@ -6247,6 +6406,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let compare = self.reserve_unused_temp_gpr(GPR::RAX); let value = if cmp == Location::GPR(GPR::R14) { @@ -6273,6 +6433,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler.emit_lock_cmpxchg( Size::S64, @@ -6299,6 +6460,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let compare = self.reserve_unused_temp_gpr(GPR::RAX); let value = if cmp == Location::GPR(GPR::R14) { @@ -6325,6 +6487,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler.emit_lock_cmpxchg( Size::S8, @@ -6351,6 +6514,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let compare = self.reserve_unused_temp_gpr(GPR::RAX); let value = if cmp == Location::GPR(GPR::R14) { @@ -6377,6 +6541,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler.emit_lock_cmpxchg( Size::S16, @@ 
-6403,6 +6568,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let compare = self.reserve_unused_temp_gpr(GPR::RAX); let value = if cmp == Location::GPR(GPR::R14) { @@ -6429,9 +6595,10 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.assembler.emit_lock_cmpxchg( - Size::S16, + Size::S32, Location::GPR(value), Location::Memory(addr, 0), )?; @@ -6453,6 +6620,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -6463,6 +6631,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_binop( AssemblerX64::emit_mov, @@ -6483,6 +6652,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let canonicalize = canonicalize && self.arch_supports_canonicalize_nan(); self.memory_op( @@ -6494,6 +6664,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { if !canonicalize { this.emit_relaxed_binop( @@ -6517,6 +6688,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { self.memory_op( addr, @@ -6527,6 +6699,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { this.emit_relaxed_binop( AssemblerX64::emit_mov, @@ -6547,6 +6720,7 @@ impl Machine for MachineX86_64 { imported_memories: bool, offset: i32, heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { let canonicalize = canonicalize && self.arch_supports_canonicalize_nan(); self.memory_op( @@ 
-6558,6 +6732,7 @@ impl Machine for MachineX86_64 { imported_memories, offset, heap_access_oob, + unaligned_atomic, |this, addr| { if !canonicalize { this.emit_relaxed_binop( From 9bbcc8a3b2cf3088c09c7b8bdc3939c93b0e1e98 Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Tue, 6 Sep 2022 10:08:19 +0200 Subject: [PATCH 05/24] Added helper functions for WaitNotify opcodes (for #3155) --- lib/types/src/libcalls.rs | 24 +++++ lib/types/src/vmoffsets.rs | 26 ++++- lib/vm/src/instance/mod.rs | 205 ++++++++++++++++++++++++++++++++++++- lib/vm/src/libcalls.rs | 151 +++++++++++++++++++++++++++ lib/vm/src/vmcontext.rs | 70 +++++++++++++ 5 files changed, 470 insertions(+), 6 deletions(-) diff --git a/lib/types/src/libcalls.rs b/lib/types/src/libcalls.rs index f794eab5a0f..e58d52ff817 100644 --- a/lib/types/src/libcalls.rs +++ b/lib/types/src/libcalls.rs @@ -115,6 +115,24 @@ pub enum LibCall { /// probe for stack overflow. These are emitted for functions which need /// when the `enable_probestack` setting is true. 
Probestack, + + /// memory.atomic.wait32 for local memories + Memory32AtomicWait32, + + /// memory.atomic.wait32 for imported memories + ImportedMemory32AtomicWait32, + + /// memory.atomic.wait64 for local memories + Memory32AtomicWait64, + + /// memory.atomic.wait64 for imported memories + ImportedMemory32AtomicWait64, + + /// memory.atomic.notify for local memories + Memory32AtomicNotify, + + /// memory.atomic.botify for imported memories + ImportedMemory32AtomicNotify, } impl LibCall { @@ -157,6 +175,12 @@ impl LibCall { Self::Probestack => "_wasmer_vm_probestack", #[cfg(not(target_vendor = "apple"))] Self::Probestack => "wasmer_vm_probestack", + Self::Memory32AtomicWait32 => "wasmer_vm_memory32_atomic_wait32", + Self::ImportedMemory32AtomicWait32 => "wasmer_vm_imported_memory32_atomic_wait32", + Self::Memory32AtomicWait64 => "wasmer_vm_memory32_atomic_wait64", + Self::ImportedMemory32AtomicWait64 => "wasmer_vm_imported_memory32_atomic_wait64", + Self::Memory32AtomicNotify => "wasmer_vm_memory32_atomic_notify", + Self::ImportedMemory32AtomicNotify => "wasmer_vm_imported_memory32_atomic_notify", } } } diff --git a/lib/types/src/vmoffsets.rs b/lib/types/src/vmoffsets.rs index d894b446976..729adc1069d 100644 --- a/lib/types/src/vmoffsets.rs +++ b/lib/types/src/vmoffsets.rs @@ -115,9 +115,33 @@ impl VMBuiltinFunctionIndex { pub const fn get_table_fill_index() -> Self { Self(23) } + /// Returns an index for wasm's local `memory.atomic.wait32` builtin function. + pub const fn get_memory_atomic_wait32_index() -> Self { + Self(24) + } + /// Returns an index for wasm's imported `memory.atomic.wait32` builtin function. + pub const fn get_imported_memory_atomic_wait32_index() -> Self { + Self(25) + } + /// Returns an index for wasm's local `memory.atomic.wait64` builtin function. + pub const fn get_memory_atomic_wait64_index() -> Self { + Self(26) + } + /// Returns an index for wasm's imported `memory.atomic.wait64` builtin function. 
+ pub const fn get_imported_memory_atomic_wait64_index() -> Self { + Self(27) + } + /// Returns an index for wasm's local `memory.atomic.notify` builtin function. + pub const fn get_memory_atomic_notify_index() -> Self { + Self(28) + } + /// Returns an index for wasm's imported `memory.atomic.notify` builtin function. + pub const fn get_imported_memory_atomic_notify_index() -> Self { + Self(29) + } /// Returns the total number of builtin functions. pub const fn builtin_functions_total_number() -> u32 { - 24 + 30 } /// Return the index as an u32 number. diff --git a/lib/vm/src/instance/mod.rs b/lib/vm/src/instance/mod.rs index dde412ff343..bf6fb783556 100644 --- a/lib/vm/src/instance/mod.rs +++ b/lib/vm/src/instance/mod.rs @@ -14,10 +14,10 @@ use crate::store::{InternalStoreHandle, StoreObjects}; use crate::table::TableElement; use crate::trap::{catch_traps, Trap, TrapCode}; use crate::vmcontext::{ - memory_copy, memory_fill, VMBuiltinFunctionsArray, VMCallerCheckedAnyfunc, VMContext, - VMFunctionContext, VMFunctionImport, VMFunctionKind, VMGlobalDefinition, VMGlobalImport, - VMMemoryDefinition, VMMemoryImport, VMSharedSignatureIndex, VMTableDefinition, VMTableImport, - VMTrampoline, + memory32_atomic_check32, memory32_atomic_check64, memory_copy, memory_fill, + VMBuiltinFunctionsArray, VMCallerCheckedAnyfunc, VMContext, VMFunctionContext, + VMFunctionImport, VMFunctionKind, VMGlobalDefinition, VMGlobalImport, VMMemoryDefinition, + VMMemoryImport, VMSharedSignatureIndex, VMTableDefinition, VMTableImport, VMTrampoline, }; use crate::LinearMemory; use crate::{FunctionBodyPtr, MaybeInstanceOwned, TrapHandlerFn, VMFunctionBody}; @@ -33,7 +33,8 @@ use std::fmt; use std::mem; use std::ptr::{self, NonNull}; use std::slice; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; +use std::thread::{current, park, park_timeout, Thread}; use wasmer_types::entity::{packed_option::ReservedValue, BoxedSlice, EntityRef, PrimaryMap}; use wasmer_types::{ DataIndex, DataInitializer, 
ElemIndex, ExportIndex, FunctionIndex, GlobalIndex, GlobalInit, @@ -48,6 +49,7 @@ use wasmer_types::{ /// to ensure that the `vmctx` field is last. See the documentation of /// the `vmctx` field to learn more. #[repr(C)] +#[allow(clippy::type_complexity)] pub(crate) struct Instance { /// The `ModuleInfo` this `Instance` was instantiated from. module: Arc, @@ -89,6 +91,9 @@ pub(crate) struct Instance { /// will point to elements here for functions imported by this instance. imported_funcrefs: BoxedSlice>, + /// The Hasmap with the Notify for the Notify/wait opcodes + conditions: Arc>>>, + /// Additional context used by compiled WebAssembly code. This /// field is last, and represents a dynamically-sized array that /// extends beyond the nominal end of the struct (similar to a @@ -777,6 +782,195 @@ impl Instance { self.imported_table(table_index).handle } } + + // To implement Wait / Notify, a HasMap, behind a mutex, will be used + // to track the address of waiter. The key of the hashmap is based on the memory + // and waiter threads are "park"'d (with or without timeout) + // Notify will wake the waiters by simply "unpark" the thread + // as the Thread info is stored on the HashMap + // once unparked, the waiter thread will remove it's mark on the HashMap + // timeout / awake is tracked with a boolean in the HashMap + // because `park_timeout` doesn't gives any information on why it returns + fn do_wait(&mut self, index: u32, dst: u32, timeout: i64) -> u32 { + // fetch the notifier + let key = (index, dst); + let mut conds = self.conditions.lock().unwrap(); + conds.entry(key).or_insert_with(Vec::new); + let v = conds.get_mut(&key).unwrap(); + v.push((current(), false)); + drop(conds); + if timeout < 0 { + park(); + } else { + park_timeout(std::time::Duration::from_nanos(timeout as u64)); + } + let mut conds = self.conditions.lock().unwrap(); + let v = conds.get_mut(&key).unwrap(); + let id = current().id(); + let mut ret = 0; + v.retain(|cond| { + if cond.0.id() == 
id { + ret = if cond.1 { 0 } else { 2 }; + false + } else { + true + } + }); + if v.is_empty() { + conds.remove(&key); + } + ret + } + + /// Perform an Atomic.Wait32 + pub(crate) fn local_memory_wait32( + &mut self, + memory_index: LocalMemoryIndex, + dst: u32, + val: u32, + timeout: i64, + ) -> Result { + let memory = self.memory(memory_index); + //if ! memory.shared { + // We should trap according to spec, but official test rely on not trapping... + //} + + let ret = unsafe { memory32_atomic_check32(&memory, dst, val) }; + + if let Ok(mut ret) = ret { + if ret == 0 { + ret = self.do_wait(memory_index.as_u32(), dst, timeout); + } + Ok(ret) + } else { + ret + } + } + + /// Perform an Atomic.Wait32 + pub(crate) fn imported_memory_wait32( + &mut self, + memory_index: MemoryIndex, + dst: u32, + val: u32, + timeout: i64, + ) -> Result { + let import = self.imported_memory(memory_index); + let memory = unsafe { import.definition.as_ref() }; + //if ! memory.shared { + // We should trap according to spec, but official test rely on not trapping... + //} + + let ret = unsafe { memory32_atomic_check32(memory, dst, val) }; + + if let Ok(mut ret) = ret { + if ret == 0 { + ret = self.do_wait(memory_index.as_u32(), dst, timeout); + } + Ok(ret) + } else { + ret + } + } + + /// Perform an Atomic.Wait64 + pub(crate) fn local_memory_wait64( + &mut self, + memory_index: LocalMemoryIndex, + dst: u32, + val: u64, + timeout: i64, + ) -> Result { + let memory = self.memory(memory_index); + //if ! memory.shared { + // We should trap according to spec, but official test rely on not trapping... 
+ //} + + let ret = unsafe { memory32_atomic_check64(&memory, dst, val) }; + + if let Ok(mut ret) = ret { + if ret == 0 { + ret = self.do_wait(memory_index.as_u32(), dst, timeout); + } + Ok(ret) + } else { + ret + } + } + + /// Perform an Atomic.Wait64 + pub(crate) fn imported_memory_wait64( + &mut self, + memory_index: MemoryIndex, + dst: u32, + val: u64, + timeout: i64, + ) -> Result { + let import = self.imported_memory(memory_index); + let memory = unsafe { import.definition.as_ref() }; + //if ! memory.shared { + // We should trap according to spec, but official test rely on not trapping... + //} + + let ret = unsafe { memory32_atomic_check64(memory, dst, val) }; + + if let Ok(mut ret) = ret { + if ret == 0 { + ret = self.do_wait(memory_index.as_u32(), dst, timeout); + } + Ok(ret) + } else { + ret + } + } + + /// Perform an Atomic.Notify + pub(crate) fn local_memory_notify( + &mut self, + memory_index: LocalMemoryIndex, + dst: u32, + ) -> Result<(), Trap> { + //let memory = self.memory(memory_index); + //if ! memory.shared { + // We should trap according to spec, but official test rely on not trapping... + //} + + // fetch the notifier + let key = (memory_index.as_u32(), dst); + let mut conds = self.conditions.lock().unwrap(); + if conds.contains_key(&key) { + let v = conds.get_mut(&key).unwrap(); + for (t, b) in v { + *b = true; // mark as was waiked up + t.unpark(); // wakeup! + } + } + Ok(()) + } + /// Perform an Atomic.Notify + pub(crate) fn imported_memory_notify( + &mut self, + memory_index: MemoryIndex, + dst: u32, + ) -> Result<(), Trap> { + //let import = self.imported_memory(memory_index); + //let memory = unsafe { import.definition.as_ref() }; + //if ! memory.shared { + // We should trap according to spec, but official test rely on not trapping... 
+ //} + + // fetch the notifier + let key = (memory_index.as_u32(), dst); + let mut conds = self.conditions.lock().unwrap(); + if conds.contains_key(&key) { + let v = conds.get_mut(&key).unwrap(); + for (t, b) in v { + *b = true; // mark as was waiked up + t.unpark(); // wakeup! + } + } + Ok(()) + } } /// A handle holding an `Instance` of a WebAssembly module. @@ -869,6 +1063,7 @@ impl InstanceHandle { funcrefs, imported_funcrefs, vmctx: VMContext {}, + conditions: Arc::new(Mutex::new(HashMap::new())), }; let mut instance_handle = allocator.write_instance(instance); diff --git a/lib/vm/src/libcalls.rs b/lib/vm/src/libcalls.rs index 9274237f167..a3f0107859c 100644 --- a/lib/vm/src/libcalls.rs +++ b/lib/vm/src/libcalls.rs @@ -667,6 +667,151 @@ pub unsafe extern "C" fn wasmer_vm_raise_trap(trap_code: TrapCode) -> ! { #[no_mangle] pub static wasmer_vm_probestack: unsafe extern "C" fn() = PROBESTACK; +/// Implementation of memory.wait32 for locally-defined 32-bit memories. +/// +/// # Safety +/// +/// `vmctx` must be dereferenceable. +#[no_mangle] +pub unsafe extern "C" fn wasmer_vm_memory32_atomic_wait32( + vmctx: *mut VMContext, + memory_index: u32, + dst: u32, + val: u32, + timeout: i64, +) -> u32 { + let result = { + let instance = (*vmctx).instance_mut(); + let memory_index = LocalMemoryIndex::from_u32(memory_index); + + instance.local_memory_wait32(memory_index, dst, val, timeout) + }; + if let Err(trap) = result { + raise_lib_trap(trap); + } + result.unwrap() +} + +/// Implementation of memory.wait32 for imported 32-bit memories. +/// +/// # Safety +/// +/// `vmctx` must be dereferenceable. 
+#[no_mangle] +pub unsafe extern "C" fn wasmer_vm_imported_memory32_atomic_wait32( + vmctx: *mut VMContext, + memory_index: u32, + dst: u32, + val: u32, + timeout: i64, +) -> u32 { + let result = { + let instance = (*vmctx).instance_mut(); + let memory_index = MemoryIndex::from_u32(memory_index); + + instance.imported_memory_wait32(memory_index, dst, val, timeout) + }; + if let Err(trap) = result { + raise_lib_trap(trap); + } + result.unwrap() +} + +/// Implementation of memory.wait64 for locally-defined 32-bit memories. +/// +/// # Safety +/// +/// `vmctx` must be dereferenceable. +#[no_mangle] +pub unsafe extern "C" fn wasmer_vm_memory32_atomic_wait64( + vmctx: *mut VMContext, + memory_index: u32, + dst: u32, + val: u64, + timeout: i64, +) -> u32 { + let result = { + let instance = (*vmctx).instance_mut(); + let memory_index = LocalMemoryIndex::from_u32(memory_index); + + instance.local_memory_wait64(memory_index, dst, val, timeout) + }; + if let Err(trap) = result { + raise_lib_trap(trap); + } + result.unwrap() +} + +/// Implementation of memory.wait64 for imported 32-bit memories. +/// +/// # Safety +/// +/// `vmctx` must be dereferenceable. +#[no_mangle] +pub unsafe extern "C" fn wasmer_vm_imported_memory32_atomic_wait64( + vmctx: *mut VMContext, + memory_index: u32, + dst: u32, + val: u64, + timeout: i64, +) -> u32 { + let result = { + let instance = (*vmctx).instance_mut(); + let memory_index = MemoryIndex::from_u32(memory_index); + + instance.imported_memory_wait64(memory_index, dst, val, timeout) + }; + if let Err(trap) = result { + raise_lib_trap(trap); + } + result.unwrap() +} + +/// Implementation of memory.notfy for locally-defined 32-bit memories. +/// +/// # Safety +/// +/// `vmctx` must be dereferenceable. 
+#[no_mangle] +pub unsafe extern "C" fn wasmer_vm_memory32_atomic_notify( + vmctx: *mut VMContext, + memory_index: u32, + dst: u32, +) { + let result = { + let instance = (*vmctx).instance_mut(); + let memory_index = LocalMemoryIndex::from_u32(memory_index); + + instance.local_memory_notify(memory_index, dst) + }; + if let Err(trap) = result { + raise_lib_trap(trap); + } +} + +/// Implementation of memory.notfy for imported 32-bit memories. +/// +/// # Safety +/// +/// `vmctx` must be dereferenceable. +#[no_mangle] +pub unsafe extern "C" fn wasmer_vm_imported_memory32_atomic_notify( + vmctx: *mut VMContext, + memory_index: u32, + dst: u32, +) { + let result = { + let instance = (*vmctx).instance_mut(); + let memory_index = MemoryIndex::from_u32(memory_index); + + instance.imported_memory_notify(memory_index, dst) + }; + if let Err(trap) = result { + raise_lib_trap(trap); + } + result.unwrap() +} + /// The function pointer to a libcall pub fn function_pointer(libcall: LibCall) -> usize { match libcall { @@ -701,5 +846,11 @@ pub fn function_pointer(libcall: LibCall) -> usize { LibCall::DataDrop => wasmer_vm_data_drop as usize, LibCall::Probestack => wasmer_vm_probestack as usize, LibCall::RaiseTrap => wasmer_vm_raise_trap as usize, + LibCall::Memory32AtomicWait32 => wasmer_vm_memory32_atomic_wait32 as usize, + LibCall::ImportedMemory32AtomicWait32 => wasmer_vm_imported_memory32_atomic_wait32 as usize, + LibCall::Memory32AtomicWait64 => wasmer_vm_memory32_atomic_wait64 as usize, + LibCall::ImportedMemory32AtomicWait64 => wasmer_vm_imported_memory32_atomic_wait64 as usize, + LibCall::Memory32AtomicNotify => wasmer_vm_memory32_atomic_notify as usize, + LibCall::ImportedMemory32AtomicNotify => wasmer_vm_imported_memory32_atomic_notify as usize, } } diff --git a/lib/vm/src/vmcontext.rs b/lib/vm/src/vmcontext.rs index 766a8708d1d..eb16620096a 100644 --- a/lib/vm/src/vmcontext.rs +++ b/lib/vm/src/vmcontext.rs @@ -14,6 +14,7 @@ use crate::VMTable; use 
crate::{VMBuiltinFunctionIndex, VMFunction}; use std::convert::TryFrom; use std::ptr::{self, NonNull}; +use std::sync::atomic::{AtomicPtr, Ordering}; use std::u32; use wasmer_types::RawValue; @@ -376,6 +377,62 @@ pub(crate) unsafe fn memory_fill( Ok(()) } +/// Perform the `memory32.atomic.check32` operation for the memory. Return 0 if same, 1 if different +/// +/// # Errors +/// +/// Returns a `Trap` error if the memory range is out of bounds. +/// +/// # Safety +/// memory access is unsafe +pub(crate) unsafe fn memory32_atomic_check32( + mem: &VMMemoryDefinition, + dst: u32, + val: u32, +) -> Result { + if usize::try_from(dst).unwrap() > mem.current_length { + return Err(Trap::lib(TrapCode::HeapAccessOutOfBounds)); + } + + let dst = isize::try_from(dst).unwrap(); + + // Bounds and casts are checked above, by this point we know that + // everything is safe. + let dst = mem.base.offset(dst) as *mut u32; + let atomic_dst = AtomicPtr::new(dst); + let read_val = *atomic_dst.load(Ordering::Acquire); + let ret = if read_val == val { 0 } else { 1 }; + Ok(ret) +} + +/// Perform the `memory32.atomic.check64` operation for the memory. Return 0 if same, 1 if different +/// +/// # Errors +/// +/// Returns a `Trap` error if the memory range is out of bounds. +/// +/// # Safety +/// memory access is unsafe +pub(crate) unsafe fn memory32_atomic_check64( + mem: &VMMemoryDefinition, + dst: u32, + val: u64, +) -> Result { + if usize::try_from(dst).unwrap() > mem.current_length { + return Err(Trap::lib(TrapCode::HeapAccessOutOfBounds)); + } + + let dst = isize::try_from(dst).unwrap(); + + // Bounds and casts are checked above, by this point we know that + // everything is safe. 
+ let dst = mem.base.offset(dst) as *mut u64; + let atomic_dst = AtomicPtr::new(dst); + let read_val = *atomic_dst.load(Ordering::Acquire); + let ret = if read_val == val { 0 } else { 1 }; + Ok(ret) +} + /// The fields compiled code needs to access to utilize a WebAssembly table /// defined within the instance. #[derive(Debug, Clone, Copy)] @@ -634,6 +691,19 @@ impl VMBuiltinFunctionsArray { ptrs[VMBuiltinFunctionIndex::get_table_fill_index().index() as usize] = wasmer_vm_table_fill as usize; + ptrs[VMBuiltinFunctionIndex::get_memory_atomic_wait32_index().index() as usize] = + wasmer_vm_memory32_atomic_wait32 as usize; + ptrs[VMBuiltinFunctionIndex::get_imported_memory_atomic_wait32_index().index() as usize] = + wasmer_vm_imported_memory32_atomic_wait32 as usize; + ptrs[VMBuiltinFunctionIndex::get_memory_atomic_wait64_index().index() as usize] = + wasmer_vm_memory32_atomic_wait64 as usize; + ptrs[VMBuiltinFunctionIndex::get_imported_memory_atomic_wait64_index().index() as usize] = + wasmer_vm_imported_memory32_atomic_wait64 as usize; + ptrs[VMBuiltinFunctionIndex::get_memory_atomic_notify_index().index() as usize] = + wasmer_vm_memory32_atomic_notify as usize; + ptrs[VMBuiltinFunctionIndex::get_imported_memory_atomic_notify_index().index() as usize] = + wasmer_vm_imported_memory32_atomic_notify as usize; + debug_assert!(ptrs.iter().cloned().all(|p| p != 0)); Self { ptrs } From bfc8d9478b2b23ce1301cac519f52a59ca2f081d Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Tue, 6 Sep 2022 10:10:53 +0200 Subject: [PATCH 06/24] Added Wait/Notify opcode handling to Sinbglepass, and enable x86_64 threads::atomic test (for #3158) --- lib/compiler-singlepass/src/codegen.rs | 167 +++++++++++++++++++++ lib/compiler-singlepass/src/emitter_x64.rs | 3 + lib/compiler-singlepass/src/machine_x64.rs | 3 +- tests/ignores.txt | 2 +- 4 files changed, 173 insertions(+), 2 deletions(-) diff --git a/lib/compiler-singlepass/src/codegen.rs b/lib/compiler-singlepass/src/codegen.rs index 
9e39b9964ba..05de4e57d76 100644 --- a/lib/compiler-singlepass/src/codegen.rs +++ b/lib/compiler-singlepass/src/codegen.rs @@ -6417,6 +6417,173 @@ impl<'a, M: Machine> FuncGen<'a, M> { [WpType::I32].iter().cloned(), )?; } + Operator::MemoryAtomicWait32 { ref memarg } => { + let timeout = self.value_stack.pop().unwrap(); + let val = self.value_stack.pop().unwrap(); + let dst = self.value_stack.pop().unwrap(); + self.release_locations_only_regs(&[timeout, val, dst])?; + + let memory_index = MemoryIndex::new(memarg.memory as usize); + let (memory_atomic_wait32, memory_index) = + if self.module.local_memory_index(memory_index).is_some() { + ( + VMBuiltinFunctionIndex::get_memory_atomic_wait32_index(), + memory_index, + ) + } else { + ( + VMBuiltinFunctionIndex::get_imported_memory_atomic_wait32_index(), + memory_index, + ) + }; + + self.machine.move_location( + Size::S64, + Location::Memory( + self.machine.get_vmctx_reg(), + self.vmoffsets.vmctx_builtin_function(memory_atomic_wait32) as i32, + ), + Location::GPR(self.machine.get_grp_for_call()), + )?; + + // TODO: should this be 3? 
+ self.release_locations_only_osr_state(1)?; + + self.emit_call_native( + |this| { + this.machine + .emit_call_register(this.machine.get_grp_for_call()) + }, + // [vmctx, memory_index, dst, src, timeout] + [ + Location::Imm32(memory_index.index() as u32), + dst, + val, + timeout, + ] + .iter() + .cloned(), + [WpType::I32, WpType::I32, WpType::I32, WpType::I64] + .iter() + .cloned(), + )?; + self.release_locations_only_stack(&[dst, val, timeout])?; + let ret = self.acquire_locations( + &[(WpType::I32, MachineValue::WasmStack(self.value_stack.len()))], + false, + )?[0]; + self.value_stack.push(ret); + self.machine.move_location( + Size::S32, + Location::GPR(self.machine.get_gpr_for_ret()), + ret, + )?; + } + Operator::MemoryAtomicWait64 { ref memarg } => { + let timeout = self.value_stack.pop().unwrap(); + let val = self.value_stack.pop().unwrap(); + let dst = self.value_stack.pop().unwrap(); + self.release_locations_only_regs(&[timeout, val, dst])?; + + let memory_index = MemoryIndex::new(memarg.memory as usize); + let (memory_atomic_wait64, memory_index) = + if self.module.local_memory_index(memory_index).is_some() { + ( + VMBuiltinFunctionIndex::get_memory_atomic_wait64_index(), + memory_index, + ) + } else { + ( + VMBuiltinFunctionIndex::get_imported_memory_atomic_wait64_index(), + memory_index, + ) + }; + + self.machine.move_location( + Size::S64, + Location::Memory( + self.machine.get_vmctx_reg(), + self.vmoffsets.vmctx_builtin_function(memory_atomic_wait64) as i32, + ), + Location::GPR(self.machine.get_grp_for_call()), + )?; + + // TODO: should this be 3? 
+ self.release_locations_only_osr_state(1)?; + + self.emit_call_native( + |this| { + this.machine + .emit_call_register(this.machine.get_grp_for_call()) + }, + // [vmctx, memory_index, dst, src, timeout] + [ + Location::Imm32(memory_index.index() as u32), + dst, + val, + timeout, + ] + .iter() + .cloned(), + [WpType::I32, WpType::I32, WpType::I64, WpType::I64] + .iter() + .cloned(), + )?; + self.release_locations_only_stack(&[dst, val, timeout])?; + let ret = self.acquire_locations( + &[(WpType::I32, MachineValue::WasmStack(self.value_stack.len()))], + false, + )?[0]; + self.value_stack.push(ret); + self.machine.move_location( + Size::S32, + Location::GPR(self.machine.get_gpr_for_ret()), + ret, + )?; + } + Operator::MemoryAtomicNotify { ref memarg } => { + let dst = self.value_stack.pop().unwrap(); + self.release_locations_only_regs(&[dst])?; + + let memory_index = MemoryIndex::new(memarg.memory as usize); + let (memory_atomic_wait32, memory_index) = + if self.module.local_memory_index(memory_index).is_some() { + ( + VMBuiltinFunctionIndex::get_memory_atomic_wait32_index(), + memory_index, + ) + } else { + ( + VMBuiltinFunctionIndex::get_imported_memory_atomic_wait32_index(), + memory_index, + ) + }; + + self.machine.move_location( + Size::S64, + Location::Memory( + self.machine.get_vmctx_reg(), + self.vmoffsets.vmctx_builtin_function(memory_atomic_wait32) as i32, + ), + Location::GPR(self.machine.get_grp_for_call()), + )?; + + // TODO: should this be 3? 
+ self.release_locations_only_osr_state(1)?; + + self.emit_call_native( + |this| { + this.machine + .emit_call_register(this.machine.get_grp_for_call()) + }, + // [vmctx, memory_index, dst, src, timeout] + [Location::Imm32(memory_index.index() as u32), dst] + .iter() + .cloned(), + [WpType::I32, WpType::I32].iter().cloned(), + )?; + self.release_locations_only_stack(&[dst])?; + } _ => { return Err(CompileError::Codegen(format!( "not yet implemented: {:?}", diff --git a/lib/compiler-singlepass/src/emitter_x64.rs b/lib/compiler-singlepass/src/emitter_x64.rs index 736f8fb48ee..4f4297db120 100644 --- a/lib/compiler-singlepass/src/emitter_x64.rs +++ b/lib/compiler-singlepass/src/emitter_x64.rs @@ -1430,6 +1430,9 @@ impl EmitterX64 for AssemblerX64 { (Size::S32, Location::Imm64(imm), Size::S64, Location::GPR(dst)) => { dynasm!(self ; mov Rq(dst as u8), imm as i32); } + (Size::S16, Location::Imm64(imm), Size::S64, Location::GPR(dst)) => { + dynasm!(self ; mov Rq(dst as u8), imm as i32); + } _ => { codegen_error!( "singlepass can't emit MOVZX {:?} {:?} {:?} {:?}", diff --git a/lib/compiler-singlepass/src/machine_x64.rs b/lib/compiler-singlepass/src/machine_x64.rs index 511b1ce8878..dad625c69aa 100644 --- a/lib/compiler-singlepass/src/machine_x64.rs +++ b/lib/compiler-singlepass/src/machine_x64.rs @@ -2415,7 +2415,8 @@ impl Machine for MachineX86_64 { Location::GPR(_) | Location::Memory(_, _) | Location::Memory2(_, _, _, _) - | Location::Imm32(_) => match size_val { + | Location::Imm32(_) + | Location::Imm64(_) => match size_val { Size::S32 | Size::S64 => self.assembler.emit_mov(size_val, source, dst), Size::S16 | Size::S8 => { if signed { diff --git a/tests/ignores.txt b/tests/ignores.txt index d3b4332ca0f..071e89145f1 100644 --- a/tests/ignores.txt +++ b/tests/ignores.txt @@ -24,7 +24,7 @@ llvm traps::start_trap_pretty cranelift+aarch64+macos traps::start_trap_pretty # Atomics (WIP) -singlepass spec::threads::atomic +singlepass+aarch64 spec::threads::atomic singlepass 
spec::threads::imports cranelift spec::threads::atomic cranelift spec::threads::imports From 94725768f587d6b8f1d6343ae418eb4346d8cd0b Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Tue, 6 Sep 2022 13:24:47 +0200 Subject: [PATCH 07/24] Map cranelift HeapMisaligned to wasmer UnalignedAtomic trap code (for #3162) --- lib/compiler-cranelift/src/compiler.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/compiler-cranelift/src/compiler.rs b/lib/compiler-cranelift/src/compiler.rs index 0303e963b2b..807c07fab9b 100644 --- a/lib/compiler-cranelift/src/compiler.rs +++ b/lib/compiler-cranelift/src/compiler.rs @@ -430,7 +430,7 @@ fn translate_ir_trapcode(trap: ir::TrapCode) -> TrapCode { match trap { ir::TrapCode::StackOverflow => TrapCode::StackOverflow, ir::TrapCode::HeapOutOfBounds => TrapCode::HeapAccessOutOfBounds, - ir::TrapCode::HeapMisaligned => TrapCode::HeapMisaligned, + ir::TrapCode::HeapMisaligned => TrapCode::UnalignedAtomic, ir::TrapCode::TableOutOfBounds => TrapCode::TableAccessOutOfBounds, ir::TrapCode::IndirectCallToNull => TrapCode::IndirectCallToNull, ir::TrapCode::BadSignature => TrapCode::BadSignature, From bb69903c5176a9c98c3db6032265f1d13eb2a571 Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Tue, 6 Sep 2022 14:13:40 +0200 Subject: [PATCH 08/24] Fixed Notify helper funciton and opcode (for #3155 and #3158) --- lib/compiler-singlepass/src/codegen.rs | 23 +++++++++++++++++------ lib/vm/src/instance/mod.rs | 26 ++++++++++++++++++-------- lib/vm/src/libcalls.rs | 11 +++++++---- 3 files changed, 42 insertions(+), 18 deletions(-) diff --git a/lib/compiler-singlepass/src/codegen.rs b/lib/compiler-singlepass/src/codegen.rs index 05de4e57d76..9b24e3be68f 100644 --- a/lib/compiler-singlepass/src/codegen.rs +++ b/lib/compiler-singlepass/src/codegen.rs @@ -6542,19 +6542,20 @@ impl<'a, M: Machine> FuncGen<'a, M> { )?; } Operator::MemoryAtomicNotify { ref memarg } => { + let cnt = self.value_stack.pop().unwrap(); let dst = 
self.value_stack.pop().unwrap(); - self.release_locations_only_regs(&[dst])?; + self.release_locations_only_regs(&[cnt, dst])?; let memory_index = MemoryIndex::new(memarg.memory as usize); - let (memory_atomic_wait32, memory_index) = + let (memory_atomic_notify, memory_index) = if self.module.local_memory_index(memory_index).is_some() { ( - VMBuiltinFunctionIndex::get_memory_atomic_wait32_index(), + VMBuiltinFunctionIndex::get_memory_atomic_notify_index(), memory_index, ) } else { ( - VMBuiltinFunctionIndex::get_imported_memory_atomic_wait32_index(), + VMBuiltinFunctionIndex::get_imported_memory_atomic_notify_index(), memory_index, ) }; @@ -6563,7 +6564,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { Size::S64, Location::Memory( self.machine.get_vmctx_reg(), - self.vmoffsets.vmctx_builtin_function(memory_atomic_wait32) as i32, + self.vmoffsets.vmctx_builtin_function(memory_atomic_notify) as i32, ), Location::GPR(self.machine.get_grp_for_call()), )?; @@ -6582,7 +6583,17 @@ impl<'a, M: Machine> FuncGen<'a, M> { .cloned(), [WpType::I32, WpType::I32].iter().cloned(), )?; - self.release_locations_only_stack(&[dst])?; + self.release_locations_only_stack(&[dst, cnt])?; + let ret = self.acquire_locations( + &[(WpType::I32, MachineValue::WasmStack(self.value_stack.len()))], + false, + )?[0]; + self.value_stack.push(ret); + self.machine.move_location( + Size::S32, + Location::GPR(self.machine.get_gpr_for_ret()), + ret, + )?; } _ => { return Err(CompileError::Codegen(format!( diff --git a/lib/vm/src/instance/mod.rs b/lib/vm/src/instance/mod.rs index bf6fb783556..878b50c47ec 100644 --- a/lib/vm/src/instance/mod.rs +++ b/lib/vm/src/instance/mod.rs @@ -929,7 +929,8 @@ impl Instance { &mut self, memory_index: LocalMemoryIndex, dst: u32, - ) -> Result<(), Trap> { + count: u32, + ) -> Result { //let memory = self.memory(memory_index); //if ! memory.shared { // We should trap according to spec, but official test rely on not trapping... 
@@ -938,21 +939,26 @@ impl Instance { // fetch the notifier let key = (memory_index.as_u32(), dst); let mut conds = self.conditions.lock().unwrap(); + let mut cnt = 0u32; if conds.contains_key(&key) { let v = conds.get_mut(&key).unwrap(); for (t, b) in v { - *b = true; // mark as was waiked up - t.unpark(); // wakeup! + if cnt < count { + *b = true; // mark as was waiked up + t.unpark(); // wakeup! + cnt += 1; + } } } - Ok(()) + Ok(cnt) } /// Perform an Atomic.Notify pub(crate) fn imported_memory_notify( &mut self, memory_index: MemoryIndex, dst: u32, - ) -> Result<(), Trap> { + count: u32, + ) -> Result { //let import = self.imported_memory(memory_index); //let memory = unsafe { import.definition.as_ref() }; //if ! memory.shared { @@ -962,14 +968,18 @@ impl Instance { // fetch the notifier let key = (memory_index.as_u32(), dst); let mut conds = self.conditions.lock().unwrap(); + let mut cnt = 0u32; if conds.contains_key(&key) { let v = conds.get_mut(&key).unwrap(); for (t, b) in v { - *b = true; // mark as was waiked up - t.unpark(); // wakeup! + if cnt < count { + *b = true; // mark as was waiked up + t.unpark(); // wakeup! + cnt += 1; + } } } - Ok(()) + Ok(cnt) } } diff --git a/lib/vm/src/libcalls.rs b/lib/vm/src/libcalls.rs index a3f0107859c..67523a14454 100644 --- a/lib/vm/src/libcalls.rs +++ b/lib/vm/src/libcalls.rs @@ -777,16 +777,18 @@ pub unsafe extern "C" fn wasmer_vm_memory32_atomic_notify( vmctx: *mut VMContext, memory_index: u32, dst: u32, -) { + cnt: u32, +) -> u32 { let result = { let instance = (*vmctx).instance_mut(); let memory_index = LocalMemoryIndex::from_u32(memory_index); - instance.local_memory_notify(memory_index, dst) + instance.local_memory_notify(memory_index, dst, cnt) }; if let Err(trap) = result { raise_lib_trap(trap); } + result.unwrap() } /// Implementation of memory.notfy for imported 32-bit memories. 
@@ -799,12 +801,13 @@ pub unsafe extern "C" fn wasmer_vm_imported_memory32_atomic_notify( vmctx: *mut VMContext, memory_index: u32, dst: u32, -) { + cnt: u32, +) -> u32 { let result = { let instance = (*vmctx).instance_mut(); let memory_index = MemoryIndex::from_u32(memory_index); - instance.imported_memory_notify(memory_index, dst) + instance.imported_memory_notify(memory_index, dst, cnt) }; if let Err(trap) = result { raise_lib_trap(trap); From 516da67ce8c5451b02f3a729f537e4daa81ed970 Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Tue, 6 Sep 2022 14:57:26 +0200 Subject: [PATCH 09/24] Added Wait/Notify opcodes for cranelift compiler (for #3156) --- lib/compiler-cranelift/src/func_environ.rs | 189 +++++++++++++++++++-- tests/ignores.txt | 1 - 2 files changed, 174 insertions(+), 16 deletions(-) diff --git a/lib/compiler-cranelift/src/func_environ.rs b/lib/compiler-cranelift/src/func_environ.rs index afb6010d4d9..0608b7add7b 100644 --- a/lib/compiler-cranelift/src/func_environ.rs +++ b/lib/compiler-cranelift/src/func_environ.rs @@ -104,6 +104,15 @@ pub struct FuncEnvironment<'module_environment> { /// The external function signature for implementing wasm's `table.fill`. table_fill_sig: Option, + /// The external function signature for implementing wasm's `memory32.atomic.wait32`. + memory32_atomic_wait32_sig: Option, + + /// The external function signature for implementing wasm's `memory32.atomic.wait64`. + memory32_atomic_wait64_sig: Option, + + /// The external function signature for implementing wasm's `memory32.atomic.notify`. + memory32_atomic_notify_sig: Option, + /// Offsets to struct fields accessed by JIT code. 
offsets: VMOffsets, @@ -143,6 +152,9 @@ impl<'module_environment> FuncEnvironment<'module_environment> { data_drop_sig: None, func_ref_sig: None, table_fill_sig: None, + memory32_atomic_wait32_sig: None, + memory32_atomic_wait64_sig: None, + memory32_atomic_notify_sig: None, offsets: VMOffsets::new(target_config.pointer_bytes(), module), memory_styles, table_styles, @@ -684,6 +696,139 @@ impl<'module_environment> FuncEnvironment<'module_environment> { (sig, VMBuiltinFunctionIndex::get_data_drop_index()) } + fn get_memory32_atomic_wait32_sig(&mut self, func: &mut Function) -> ir::SigRef { + let sig = self.memory32_atomic_wait32_sig.unwrap_or_else(|| { + func.import_signature(Signature { + params: vec![ + AbiParam::special(self.pointer_type(), ArgumentPurpose::VMContext), + // Memory Index + AbiParam::new(I32), + // Dst + AbiParam::new(I32), + // Val + AbiParam::new(I32), + // Timeout + AbiParam::new(I64), + ], + returns: vec![AbiParam::new(I32)], + call_conv: self.target_config.default_call_conv, + }) + }); + self.memory32_atomic_wait32_sig = Some(sig); + sig + } + + /// Return the memory.atomic.wait32 function signature to call for the given index, + /// along with the translated index value to pass to it + /// and its index in `VMBuiltinFunctionsArray`. 
+ fn get_memory_atomic_wait32_func( + &mut self, + func: &mut Function, + index: MemoryIndex, + ) -> (ir::SigRef, usize, VMBuiltinFunctionIndex) { + if self.module.is_imported_memory(index) { + ( + self.get_memory32_atomic_wait32_sig(func), + index.index(), + VMBuiltinFunctionIndex::get_imported_memory_atomic_wait32_index(), + ) + } else { + ( + self.get_memory32_atomic_wait32_sig(func), + self.module.local_memory_index(index).unwrap().index(), + VMBuiltinFunctionIndex::get_memory_atomic_wait32_index(), + ) + } + } + + fn get_memory32_atomic_wait64_sig(&mut self, func: &mut Function) -> ir::SigRef { + let sig = self.memory32_atomic_wait64_sig.unwrap_or_else(|| { + func.import_signature(Signature { + params: vec![ + AbiParam::special(self.pointer_type(), ArgumentPurpose::VMContext), + // Memory Index + AbiParam::new(I32), + // Dst + AbiParam::new(I32), + // Val + AbiParam::new(I64), + // Timeout + AbiParam::new(I64), + ], + returns: vec![AbiParam::new(I32)], + call_conv: self.target_config.default_call_conv, + }) + }); + self.memory32_atomic_wait64_sig = Some(sig); + sig + } + + /// Return the memory.atomic.wait64 function signature to call for the given index, + /// along with the translated index value to pass to it + /// and its index in `VMBuiltinFunctionsArray`. 
+ fn get_memory_atomic_wait64_func( + &mut self, + func: &mut Function, + index: MemoryIndex, + ) -> (ir::SigRef, usize, VMBuiltinFunctionIndex) { + if self.module.is_imported_memory(index) { + ( + self.get_memory32_atomic_wait64_sig(func), + index.index(), + VMBuiltinFunctionIndex::get_imported_memory_atomic_wait64_index(), + ) + } else { + ( + self.get_memory32_atomic_wait64_sig(func), + self.module.local_memory_index(index).unwrap().index(), + VMBuiltinFunctionIndex::get_memory_atomic_wait64_index(), + ) + } + } + + fn get_memory32_atomic_notify_sig(&mut self, func: &mut Function) -> ir::SigRef { + let sig = self.memory32_atomic_notify_sig.unwrap_or_else(|| { + func.import_signature(Signature { + params: vec![ + AbiParam::special(self.pointer_type(), ArgumentPurpose::VMContext), + // Memory Index + AbiParam::new(I32), + // Dst + AbiParam::new(I32), + // Count + AbiParam::new(I32), + ], + returns: vec![AbiParam::new(I32)], + call_conv: self.target_config.default_call_conv, + }) + }); + self.memory32_atomic_notify_sig = Some(sig); + sig + } + + /// Return the memory.atomic.notify function signature to call for the given index, + /// along with the translated index value to pass to it + /// and its index in `VMBuiltinFunctionsArray`. + fn get_memory_atomic_notify_func( + &mut self, + func: &mut Function, + index: MemoryIndex, + ) -> (ir::SigRef, usize, VMBuiltinFunctionIndex) { + if self.module.is_imported_memory(index) { + ( + self.get_memory32_atomic_notify_sig(func), + index.index(), + VMBuiltinFunctionIndex::get_imported_memory_atomic_notify_index(), + ) + } else { + ( + self.get_memory32_atomic_notify_sig(func), + self.module.local_memory_index(index).unwrap().index(), + VMBuiltinFunctionIndex::get_memory_atomic_notify_index(), + ) + } + } + /// Translates load of builtin function and returns a pair of values `vmctx` /// and address of the loaded function. 
fn translate_load_builtin_function_address( @@ -1389,29 +1534,43 @@ impl<'module_environment> BaseFuncEnvironment for FuncEnvironment<'module_enviro fn translate_atomic_wait( &mut self, - _pos: FuncCursor, - _index: MemoryIndex, + mut pos: FuncCursor, + index: MemoryIndex, _heap: ir::Heap, - _addr: ir::Value, - _expected: ir::Value, - _timeout: ir::Value, + addr: ir::Value, + expected: ir::Value, + timeout: ir::Value, ) -> WasmResult { - Err(WasmError::Unsupported( - "wasm atomics (fn translate_atomic_wait)".to_string(), - )) + let (func_sig, index_arg, func_idx) = if pos.func.dfg.value_type(expected) == I64 { + self.get_memory_atomic_wait64_func(pos.func, index) + } else { + self.get_memory_atomic_wait32_func(pos.func, index) + }; + let memory_index = pos.ins().iconst(I32, index_arg as i64); + let (vmctx, func_addr) = self.translate_load_builtin_function_address(&mut pos, func_idx); + let call_inst = pos.ins().call_indirect( + func_sig, + func_addr, + &[vmctx, memory_index, addr, expected, timeout], + ); + Ok(*pos.func.dfg.inst_results(call_inst).first().unwrap()) } fn translate_atomic_notify( &mut self, - _pos: FuncCursor, - _index: MemoryIndex, + mut pos: FuncCursor, + index: MemoryIndex, _heap: ir::Heap, - _addr: ir::Value, - _count: ir::Value, + addr: ir::Value, + count: ir::Value, ) -> WasmResult { - Err(WasmError::Unsupported( - "wasm atomics (fn translate_atomic_notify)".to_string(), - )) + let (func_sig, index_arg, func_idx) = self.get_memory_atomic_notify_func(pos.func, index); + let memory_index = pos.ins().iconst(I32, index_arg as i64); + let (vmctx, func_addr) = self.translate_load_builtin_function_address(&mut pos, func_idx); + let call_inst = + pos.ins() + .call_indirect(func_sig, func_addr, &[vmctx, memory_index, addr, count]); + Ok(*pos.func.dfg.inst_results(call_inst).first().unwrap()) } fn get_global_type(&self, global_index: GlobalIndex) -> Option { diff --git a/tests/ignores.txt b/tests/ignores.txt index 071e89145f1..96b081b7b04 100644 --- 
a/tests/ignores.txt +++ b/tests/ignores.txt @@ -26,7 +26,6 @@ cranelift+aarch64+macos traps::start_trap_pretty # Atomics (WIP) singlepass+aarch64 spec::threads::atomic singlepass spec::threads::imports -cranelift spec::threads::atomic cranelift spec::threads::imports llvm spec::threads::atomic llvm spec::threads::imports From 04a03fc5247d5f20ecef432b51befec040550dc4 Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Wed, 7 Sep 2022 10:36:44 +0200 Subject: [PATCH 10/24] Fixed clippy warning on using Self --- lib/vm/src/memory.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/vm/src/memory.rs b/lib/vm/src/memory.rs index 99d69d9e363..c4370d474eb 100644 --- a/lib/vm/src/memory.rs +++ b/lib/vm/src/memory.rs @@ -453,7 +453,7 @@ impl VMMemory { /// /// This creates a `Memory` with owned metadata: this can be used to create a memory /// that will be imported into Wasm modules. - pub fn new(memory: &MemoryType, style: &MemoryStyle) -> Result { + pub fn new(memory: &MemoryType, style: &MemoryStyle) -> Result { Ok(if memory.shared { Self(Box::new(VMSharedMemory::new(memory, style)?)) } else { @@ -477,7 +477,7 @@ impl VMMemory { memory: &MemoryType, style: &MemoryStyle, vm_memory_location: NonNull, - ) -> Result { + ) -> Result { Ok(if memory.shared { Self(Box::new(VMSharedMemory::from_definition( memory, From 6ed8e4832efa03d2c1a3d8f9519e82cd48d6e918 Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Wed, 7 Sep 2022 15:05:09 +0200 Subject: [PATCH 11/24] Added Wait / Notify opcode to LLVM compiler (for #3157) --- lib/compiler-llvm/src/object_file.rs | 24 ++ lib/compiler-llvm/src/translator/code.rs | 53 ++++ .../src/translator/intrinsics.rs | 236 ++++++++++++++++++ 3 files changed, 313 insertions(+) diff --git a/lib/compiler-llvm/src/object_file.rs b/lib/compiler-llvm/src/object_file.rs index 2807c36338e..5e5734e9728 100644 --- a/lib/compiler-llvm/src/object_file.rs +++ b/lib/compiler-llvm/src/object_file.rs @@ -96,6 +96,30 @@ where 
libcalls.insert("wasmer_vm_memory32_init".to_string(), LibCall::Memory32Init); libcalls.insert("wasmer_vm_data_drop".to_string(), LibCall::DataDrop); libcalls.insert("wasmer_vm_raise_trap".to_string(), LibCall::RaiseTrap); + libcalls.insert( + "wasmer_vm_memory32_atomic_wait32".to_string(), + LibCall::Memory32AtomicWait32, + ); + libcalls.insert( + "wasmer_vm_imported_memory32_atomic_wait32".to_string(), + LibCall::ImportedMemory32AtomicWait32, + ); + libcalls.insert( + "wasmer_vm_memory32_atomic_wait64".to_string(), + LibCall::Memory32AtomicWait64, + ); + libcalls.insert( + "wasmer_vm_imported_memory32_atomic_wait64".to_string(), + LibCall::ImportedMemory32AtomicWait64, + ); + libcalls.insert( + "wasmer_vm_memory32_atomic_notify".to_string(), + LibCall::Memory32AtomicNotify, + ); + libcalls.insert( + "wasmer_vm_imported_memory32_atomic_notify".to_string(), + LibCall::ImportedMemory32AtomicNotify, + ); let elf = object::File::parse(contents).map_err(map_object_err)?; diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index 0d372e95e04..8dc74b0a57b 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -11231,6 +11231,59 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { .unwrap(); self.state.push1(size); } + Operator::MemoryAtomicWait32 { memarg } => { + let memory_index = MemoryIndex::from_u32(memarg.memory); + let (dst, val, timeout) = self.state.pop3()?; + let wait32_fn_ptr = self.ctx.memory_wait32(memory_index, self.intrinsics); + let callable_func = inkwell::values::CallableValue::try_from(wait32_fn_ptr).unwrap(); + let ret = self.builder.build_call( + callable_func, + &[ + vmctx.as_basic_value_enum().into(), + self.intrinsics.i32_ty.const_int(memarg.memory as u64, false).into(), + dst.into(), + val.into(), + timeout.into(), + ], + "", + ); + self.state.push1(ret.try_as_basic_value().left().unwrap()); + } + Operator::MemoryAtomicWait64 { memarg } => { + let 
memory_index = MemoryIndex::from_u32(memarg.memory); + let (dst, val, timeout) = self.state.pop3()?; + let wait64_fn_ptr = self.ctx.memory_wait64(memory_index, self.intrinsics); + let callable_func = inkwell::values::CallableValue::try_from(wait64_fn_ptr).unwrap(); + let ret = self.builder.build_call( + callable_func, + &[ + vmctx.as_basic_value_enum().into(), + self.intrinsics.i32_ty.const_int(memarg.memory as u64, false).into(), + dst.into(), + val.into(), + timeout.into(), + ], + "", + ); + self.state.push1(ret.try_as_basic_value().left().unwrap()); + } + Operator::MemoryAtomicNotify { memarg } => { + let memory_index = MemoryIndex::from_u32(memarg.memory); + let (dst, count) = self.state.pop2()?; + let notify_fn_ptr = self.ctx.memory_notify(memory_index, self.intrinsics); + let callable_func = inkwell::values::CallableValue::try_from(notify_fn_ptr).unwrap(); + let cnt = self.builder.build_call( + callable_func, + &[ + vmctx.as_basic_value_enum().into(), + self.intrinsics.i32_ty.const_int(memarg.memory as u64, false).into(), + dst.into(), + count.into(), + ], + "", + ); + self.state.push1(cnt.try_as_basic_value().left().unwrap()); + } _ => { return Err(CompileError::Codegen(format!( "Operator {:?} unimplemented", diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index 7a1d1ebb9ba..d20b4c07bce 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -240,6 +240,12 @@ pub struct Intrinsics<'ctx> { pub imported_memory_copy: FunctionValue<'ctx>, pub memory_fill: FunctionValue<'ctx>, pub imported_memory_fill: FunctionValue<'ctx>, + pub memory_wait32: FunctionValue<'ctx>, + pub imported_memory_wait32: FunctionValue<'ctx>, + pub memory_wait64: FunctionValue<'ctx>, + pub imported_memory_wait64: FunctionValue<'ctx>, + pub memory_notify: FunctionValue<'ctx>, + pub imported_memory_notify: FunctionValue<'ctx>, pub throw_trap: FunctionValue<'ctx>, @@ -256,6 
+262,12 @@ pub struct Intrinsics<'ctx> { pub imported_memory32_grow_ptr_ty: PointerType<'ctx>, pub memory32_size_ptr_ty: PointerType<'ctx>, pub imported_memory32_size_ptr_ty: PointerType<'ctx>, + pub memory32_wait32_ptr_ty: PointerType<'ctx>, + pub imported_memory32_wait32_ptr_ty: PointerType<'ctx>, + pub memory32_wait64_ptr_ty: PointerType<'ctx>, + pub imported_memory32_wait64_ptr_ty: PointerType<'ctx>, + pub memory32_notify_ptr_ty: PointerType<'ctx>, + pub imported_memory32_notify_ptr_ty: PointerType<'ctx>, // Pointer to the VM. pub ctx_ptr_ty: PointerType<'ctx>, @@ -1007,6 +1019,86 @@ impl<'ctx> Intrinsics<'ctx> { void_ty.fn_type(&[i32_ty_basic_md], false), None, ), + memory_wait32: module.add_function( + "wasmer_vm_memory32_atomic_wait32", + i32_ty.fn_type( + &[ + ctx_ptr_ty_basic_md, + i32_ty_basic_md, + i32_ty_basic_md, + i32_ty_basic_md, + i64_ty_basic_md, + ], + false, + ), + None, + ), + imported_memory_wait32: module.add_function( + "wasmer_vm_imported_memory32_atomic_wait32", + i32_ty.fn_type( + &[ + ctx_ptr_ty_basic_md, + i32_ty_basic_md, + i32_ty_basic_md, + i32_ty_basic_md, + i64_ty_basic_md, + ], + false, + ), + None, + ), + memory_wait64: module.add_function( + "wasmer_vm_memory32_atomic_wait64", + i32_ty.fn_type( + &[ + ctx_ptr_ty_basic_md, + i32_ty_basic_md, + i32_ty_basic_md, + i64_ty_basic_md, + i64_ty_basic_md, + ], + false, + ), + None, + ), + imported_memory_wait64: module.add_function( + "wasmer_vm_imported_memory32_atomic_wait64", + i32_ty.fn_type( + &[ + ctx_ptr_ty_basic_md, + i32_ty_basic_md, + i32_ty_basic_md, + i64_ty_basic_md, + i64_ty_basic_md, + ], + false, + ), + None, + ), + memory_notify: module.add_function( + "wasmer_vm_memory32_atomic_notify", + i32_ty.fn_type( + &[ + ctx_ptr_ty_basic_md, + i32_ty_basic_md, + i32_ty_basic_md, + ], + false, + ), + None, + ), + imported_memory_notify: module.add_function( + "wasmer_vm_imported_memory32_atomic_notify", + i32_ty.fn_type( + &[ + ctx_ptr_ty_basic_md, + i32_ty_basic_md, + 
i32_ty_basic_md, + ], + false, + ), + None, + ), vmfunction_import_ptr_ty: context .struct_type(&[i8_ptr_ty_basic, i8_ptr_ty_basic], false) @@ -1038,6 +1130,24 @@ impl<'ctx> Intrinsics<'ctx> { imported_memory32_size_ptr_ty: i32_ty .fn_type(&[ctx_ptr_ty_basic_md, i32_ty_basic_md], false) .ptr_type(AddressSpace::Generic), + memory32_wait32_ptr_ty: i32_ty + .fn_type(&[ctx_ptr_ty_basic_md, i32_ty_basic_md, i32_ty_basic_md, i32_ty_basic_md, i64_ty_basic_md], false) + .ptr_type(AddressSpace::Generic), + imported_memory32_wait32_ptr_ty: i32_ty + .fn_type(&[ctx_ptr_ty_basic_md, i32_ty_basic_md, i32_ty_basic_md, i32_ty_basic_md, i64_ty_basic_md], false) + .ptr_type(AddressSpace::Generic), + memory32_wait64_ptr_ty: i32_ty + .fn_type(&[ctx_ptr_ty_basic_md, i32_ty_basic_md, i32_ty_basic_md, i64_ty_basic_md, i64_ty_basic_md], false) + .ptr_type(AddressSpace::Generic), + imported_memory32_wait64_ptr_ty: i32_ty + .fn_type(&[ctx_ptr_ty_basic_md, i32_ty_basic_md, i32_ty_basic_md, i64_ty_basic_md, i64_ty_basic_md], false) + .ptr_type(AddressSpace::Generic), + memory32_notify_ptr_ty: i32_ty + .fn_type(&[ctx_ptr_ty_basic_md, i32_ty_basic_md, i32_ty_basic_md, i32_ty_basic_md], false) + .ptr_type(AddressSpace::Generic), + imported_memory32_notify_ptr_ty: i32_ty + .fn_type(&[ctx_ptr_ty_basic_md, i32_ty_basic_md, i32_ty_basic_md, i32_ty_basic_md], false) + .ptr_type(AddressSpace::Generic), ctx_ptr_ty, }; @@ -1658,6 +1768,132 @@ impl<'ctx, 'a> CtxType<'ctx, 'a> { }) } + pub fn memory_wait32( + &mut self, + memory_index: MemoryIndex, + intrinsics: &Intrinsics<'ctx>, + ) -> PointerValue<'ctx> { + let (cached_memory_size, wasm_module, offsets, cache_builder, ctx_ptr_value) = ( + &mut self.cached_memory_size, + &self.wasm_module, + &self.offsets, + &self.cache_builder, + &self.ctx_ptr_value, + ); + *cached_memory_size.entry(memory_index).or_insert_with(|| { + let (size_fn, size_fn_ty) = if wasm_module.local_memory_index(memory_index).is_some() { + ( + 
VMBuiltinFunctionIndex::get_memory_atomic_wait32_index(), + intrinsics.memory32_wait32_ptr_ty, + ) + } else { + ( + VMBuiltinFunctionIndex::get_imported_memory_atomic_wait32_index(), + intrinsics.imported_memory32_wait32_ptr_ty, + ) + }; + let offset = offsets.vmctx_builtin_function(size_fn); + let offset = intrinsics.i32_ty.const_int(offset.into(), false); + let size_fn_ptr_ptr = unsafe { cache_builder.build_gep(*ctx_ptr_value, &[offset], "") }; + + let size_fn_ptr_ptr = cache_builder + .build_bitcast( + size_fn_ptr_ptr, + size_fn_ty.ptr_type(AddressSpace::Generic), + "", + ) + .into_pointer_value(); + + cache_builder + .build_load(size_fn_ptr_ptr, "") + .into_pointer_value() + }) + } + + pub fn memory_wait64( + &mut self, + memory_index: MemoryIndex, + intrinsics: &Intrinsics<'ctx>, + ) -> PointerValue<'ctx> { + let (cached_memory_size, wasm_module, offsets, cache_builder, ctx_ptr_value) = ( + &mut self.cached_memory_size, + &self.wasm_module, + &self.offsets, + &self.cache_builder, + &self.ctx_ptr_value, + ); + *cached_memory_size.entry(memory_index).or_insert_with(|| { + let (size_fn, size_fn_ty) = if wasm_module.local_memory_index(memory_index).is_some() { + ( + VMBuiltinFunctionIndex::get_memory_atomic_wait64_index(), + intrinsics.memory32_wait64_ptr_ty, + ) + } else { + ( + VMBuiltinFunctionIndex::get_imported_memory_atomic_wait64_index(), + intrinsics.imported_memory32_wait64_ptr_ty, + ) + }; + let offset = offsets.vmctx_builtin_function(size_fn); + let offset = intrinsics.i32_ty.const_int(offset.into(), false); + let size_fn_ptr_ptr = unsafe { cache_builder.build_gep(*ctx_ptr_value, &[offset], "") }; + + let size_fn_ptr_ptr = cache_builder + .build_bitcast( + size_fn_ptr_ptr, + size_fn_ty.ptr_type(AddressSpace::Generic), + "", + ) + .into_pointer_value(); + + cache_builder + .build_load(size_fn_ptr_ptr, "") + .into_pointer_value() + }) + } + + pub fn memory_notify( + &mut self, + memory_index: MemoryIndex, + intrinsics: &Intrinsics<'ctx>, + ) -> 
PointerValue<'ctx> { + let (cached_memory_size, wasm_module, offsets, cache_builder, ctx_ptr_value) = ( + &mut self.cached_memory_size, + &self.wasm_module, + &self.offsets, + &self.cache_builder, + &self.ctx_ptr_value, + ); + *cached_memory_size.entry(memory_index).or_insert_with(|| { + let (size_fn, size_fn_ty) = if wasm_module.local_memory_index(memory_index).is_some() { + ( + VMBuiltinFunctionIndex::get_memory_atomic_notify_index(), + intrinsics.memory32_notify_ptr_ty, + ) + } else { + ( + VMBuiltinFunctionIndex::get_imported_memory_atomic_notify_index(), + intrinsics.imported_memory32_notify_ptr_ty, + ) + }; + let offset = offsets.vmctx_builtin_function(size_fn); + let offset = intrinsics.i32_ty.const_int(offset.into(), false); + let size_fn_ptr_ptr = unsafe { cache_builder.build_gep(*ctx_ptr_value, &[offset], "") }; + + let size_fn_ptr_ptr = cache_builder + .build_bitcast( + size_fn_ptr_ptr, + size_fn_ty.ptr_type(AddressSpace::Generic), + "", + ) + .into_pointer_value(); + + cache_builder + .build_load(size_fn_ptr_ptr, "") + .into_pointer_value() + }) + } + pub fn get_offsets(&self) -> &VMOffsets { &self.offsets } From 74721a8b25d7cbd82d87836d326df26a5f734cbb Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Wed, 7 Sep 2022 15:21:34 +0200 Subject: [PATCH 12/24] Fixed linter --- .../src/translator/intrinsics.rs | 82 ++++++++++++++----- 1 file changed, 63 insertions(+), 19 deletions(-) diff --git a/lib/compiler-llvm/src/translator/intrinsics.rs b/lib/compiler-llvm/src/translator/intrinsics.rs index d20b4c07bce..028b0a37aab 100644 --- a/lib/compiler-llvm/src/translator/intrinsics.rs +++ b/lib/compiler-llvm/src/translator/intrinsics.rs @@ -1078,11 +1078,7 @@ impl<'ctx> Intrinsics<'ctx> { memory_notify: module.add_function( "wasmer_vm_memory32_atomic_notify", i32_ty.fn_type( - &[ - ctx_ptr_ty_basic_md, - i32_ty_basic_md, - i32_ty_basic_md, - ], + &[ctx_ptr_ty_basic_md, i32_ty_basic_md, i32_ty_basic_md], false, ), None, @@ -1090,11 +1086,7 @@ impl<'ctx> Intrinsics<'ctx> 
{ imported_memory_notify: module.add_function( "wasmer_vm_imported_memory32_atomic_notify", i32_ty.fn_type( - &[ - ctx_ptr_ty_basic_md, - i32_ty_basic_md, - i32_ty_basic_md, - ], + &[ctx_ptr_ty_basic_md, i32_ty_basic_md, i32_ty_basic_md], false, ), None, @@ -1131,22 +1123,74 @@ impl<'ctx> Intrinsics<'ctx> { .fn_type(&[ctx_ptr_ty_basic_md, i32_ty_basic_md], false) .ptr_type(AddressSpace::Generic), memory32_wait32_ptr_ty: i32_ty - .fn_type(&[ctx_ptr_ty_basic_md, i32_ty_basic_md, i32_ty_basic_md, i32_ty_basic_md, i64_ty_basic_md], false) + .fn_type( + &[ + ctx_ptr_ty_basic_md, + i32_ty_basic_md, + i32_ty_basic_md, + i32_ty_basic_md, + i64_ty_basic_md, + ], + false, + ) .ptr_type(AddressSpace::Generic), imported_memory32_wait32_ptr_ty: i32_ty - .fn_type(&[ctx_ptr_ty_basic_md, i32_ty_basic_md, i32_ty_basic_md, i32_ty_basic_md, i64_ty_basic_md], false) + .fn_type( + &[ + ctx_ptr_ty_basic_md, + i32_ty_basic_md, + i32_ty_basic_md, + i32_ty_basic_md, + i64_ty_basic_md, + ], + false, + ) .ptr_type(AddressSpace::Generic), memory32_wait64_ptr_ty: i32_ty - .fn_type(&[ctx_ptr_ty_basic_md, i32_ty_basic_md, i32_ty_basic_md, i64_ty_basic_md, i64_ty_basic_md], false) + .fn_type( + &[ + ctx_ptr_ty_basic_md, + i32_ty_basic_md, + i32_ty_basic_md, + i64_ty_basic_md, + i64_ty_basic_md, + ], + false, + ) .ptr_type(AddressSpace::Generic), imported_memory32_wait64_ptr_ty: i32_ty - .fn_type(&[ctx_ptr_ty_basic_md, i32_ty_basic_md, i32_ty_basic_md, i64_ty_basic_md, i64_ty_basic_md], false) + .fn_type( + &[ + ctx_ptr_ty_basic_md, + i32_ty_basic_md, + i32_ty_basic_md, + i64_ty_basic_md, + i64_ty_basic_md, + ], + false, + ) .ptr_type(AddressSpace::Generic), memory32_notify_ptr_ty: i32_ty - .fn_type(&[ctx_ptr_ty_basic_md, i32_ty_basic_md, i32_ty_basic_md, i32_ty_basic_md], false) + .fn_type( + &[ + ctx_ptr_ty_basic_md, + i32_ty_basic_md, + i32_ty_basic_md, + i32_ty_basic_md, + ], + false, + ) .ptr_type(AddressSpace::Generic), imported_memory32_notify_ptr_ty: i32_ty - 
.fn_type(&[ctx_ptr_ty_basic_md, i32_ty_basic_md, i32_ty_basic_md, i32_ty_basic_md], false) + .fn_type( + &[ + ctx_ptr_ty_basic_md, + i32_ty_basic_md, + i32_ty_basic_md, + i32_ty_basic_md, + ], + false, + ) .ptr_type(AddressSpace::Generic), ctx_ptr_ty, @@ -1809,7 +1853,7 @@ impl<'ctx, 'a> CtxType<'ctx, 'a> { .into_pointer_value() }) } - + pub fn memory_wait64( &mut self, memory_index: MemoryIndex, @@ -1851,7 +1895,7 @@ impl<'ctx, 'a> CtxType<'ctx, 'a> { .into_pointer_value() }) } - + pub fn memory_notify( &mut self, memory_index: MemoryIndex, @@ -1893,7 +1937,7 @@ impl<'ctx, 'a> CtxType<'ctx, 'a> { .into_pointer_value() }) } - + pub fn get_offsets(&self) -> &VMOffsets { &self.offsets } From 12ce30929280221d80857687c532f13861ddf19e Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Wed, 7 Sep 2022 15:26:22 +0200 Subject: [PATCH 13/24] Fixed Alignment check for Atomic access in LLVM compiler (for #3163) --- lib/compiler-llvm/src/translator/code.rs | 152 +++++++++++++---------- tests/ignores.txt | 1 - 2 files changed, 83 insertions(+), 70 deletions(-) diff --git a/lib/compiler-llvm/src/translator/code.rs b/lib/compiler-llvm/src/translator/code.rs index 8dc74b0a57b..8bdd065532c 100644 --- a/lib/compiler-llvm/src/translator/code.rs +++ b/lib/compiler-llvm/src/translator/code.rs @@ -1174,8 +1174,10 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { .into_pointer_value()) } - fn trap_if_misaligned(&self, memarg: &MemoryImmediate, ptr: PointerValue<'ctx>) { - let align = memarg.align; + fn trap_if_misaligned(&self, _memarg: &MemoryImmediate, ptr: PointerValue<'ctx>, align: u8) { + if align <= 1 { + return; + } let value = self .builder .build_ptr_to_int(ptr, self.intrinsics.i64_ty, ""); @@ -8962,7 +8964,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 4, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 4); let result = self.builder.build_load(effective_address, ""); let load = 
result.as_instruction_value().unwrap(); self.annotate_user_memaccess(memory_index, memarg, 4, load)?; @@ -8980,7 +8982,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 8, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 8); let result = self.builder.build_load(effective_address, ""); let load = result.as_instruction_value().unwrap(); self.annotate_user_memaccess(memory_index, memarg, 8, load)?; @@ -8998,7 +9000,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 1, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 1); let narrow_result = self .builder .build_load(effective_address, "") @@ -9022,7 +9024,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 2, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 2); let narrow_result = self .builder .build_load(effective_address, "") @@ -9046,7 +9048,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 1, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 1); let narrow_result = self .builder .build_load(effective_address, "") @@ -9070,7 +9072,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 2, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 2); let narrow_result = self .builder .build_load(effective_address, "") @@ -9094,7 +9096,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 4, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 4); let narrow_result = self .builder .build_load(effective_address, "") @@ -9119,7 +9121,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 4, )?; - self.trap_if_misaligned(memarg, effective_address); + 
self.trap_if_misaligned(memarg, effective_address, 4); let store = self.builder.build_store(effective_address, value); self.annotate_user_memaccess(memory_index, memarg, 4, store)?; store @@ -9137,7 +9139,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 8, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 8); let store = self.builder.build_store(effective_address, value); self.annotate_user_memaccess(memory_index, memarg, 8, store)?; store @@ -9155,7 +9157,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 1, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 1); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i8_ty, ""); @@ -9177,7 +9179,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 2, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 2); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i16_ty, ""); @@ -9198,7 +9200,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 4, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 4); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i32_ty, ""); @@ -9219,7 +9221,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 1, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 1); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i8_ty, ""); @@ -9254,7 +9256,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 2, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 2); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i16_ty, ""); @@ -9289,7 +9291,7 @@ impl<'ctx, 
'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 4, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 4); let old = self .builder .build_atomicrmw( @@ -9318,7 +9320,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 1, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 1); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i8_ty, ""); @@ -9353,7 +9355,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 2, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 2); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i16_ty, ""); @@ -9388,7 +9390,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 4, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 4); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i32_ty, ""); @@ -9423,7 +9425,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 8, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 8); let old = self .builder .build_atomicrmw( @@ -9452,7 +9454,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 1, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 1); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i8_ty, ""); @@ -9487,7 +9489,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 2, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 2); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i16_ty, ""); @@ -9522,7 +9524,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 4, )?; - 
self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 4); let old = self .builder .build_atomicrmw( @@ -9551,7 +9553,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 1, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 1); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i8_ty, ""); @@ -9586,7 +9588,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 2, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 2); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i16_ty, ""); @@ -9621,7 +9623,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 4, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 4); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i32_ty, ""); @@ -9656,7 +9658,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 8, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 8); let old = self .builder .build_atomicrmw( @@ -9685,7 +9687,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 1, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 1); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i8_ty, ""); @@ -9720,7 +9722,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 2, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 2); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i16_ty, ""); @@ -9755,7 +9757,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 4, )?; - self.trap_if_misaligned(memarg, effective_address); + 
self.trap_if_misaligned(memarg, effective_address, 4); let old = self .builder .build_atomicrmw( @@ -9784,7 +9786,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 1, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 1); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i8_ty, ""); @@ -9819,7 +9821,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 2, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 2); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i16_ty, ""); @@ -9854,7 +9856,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 4, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 4); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i32_ty, ""); @@ -9889,7 +9891,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 8, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 8); let old = self .builder .build_atomicrmw( @@ -9918,7 +9920,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 1, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 1); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i8_ty, ""); @@ -9953,7 +9955,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 2, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 2); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i16_ty, ""); @@ -9988,7 +9990,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 4, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 4); let old = self 
.builder .build_atomicrmw( @@ -10020,7 +10022,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 1, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 1); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i8_ty, ""); @@ -10055,7 +10057,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 2, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 2); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i16_ty, ""); @@ -10090,7 +10092,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 4, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 4); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i32_ty, ""); @@ -10125,7 +10127,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 8, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 8); let old = self .builder .build_atomicrmw( @@ -10154,7 +10156,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 1, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 1); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i8_ty, ""); @@ -10189,7 +10191,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 2, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 2); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i16_ty, ""); @@ -10224,7 +10226,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 4, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 4); let old = self .builder .build_atomicrmw( @@ -10253,7 +10255,7 @@ 
impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 1, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 1); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i8_ty, ""); @@ -10288,7 +10290,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 2, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 2); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i16_ty, ""); @@ -10323,7 +10325,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 4, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 4); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i32_ty, ""); @@ -10358,7 +10360,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 8, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 8); let old = self .builder .build_atomicrmw( @@ -10387,7 +10389,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 1, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 1); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i8_ty, ""); @@ -10422,7 +10424,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 2, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 2); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i16_ty, ""); @@ -10457,7 +10459,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 4, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 4); let old = self .builder .build_atomicrmw( @@ -10486,7 +10488,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { 
offset, 1, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 1); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i8_ty, ""); @@ -10521,7 +10523,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 2, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 2); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i16_ty, ""); @@ -10556,7 +10558,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 4, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 4); let narrow_value = self.builder .build_int_truncate(value, self.intrinsics.i32_ty, ""); @@ -10591,7 +10593,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 8, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 8); let old = self .builder .build_atomicrmw( @@ -10623,7 +10625,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 1, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 1); let narrow_cmp = self .builder .build_int_truncate(cmp, self.intrinsics.i8_ty, ""); @@ -10670,7 +10672,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 2, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 2); let narrow_cmp = self .builder .build_int_truncate(cmp, self.intrinsics.i16_ty, ""); @@ -10717,7 +10719,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 4, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 4); let old = self .builder .build_cmpxchg( @@ -10751,7 +10753,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 1, )?; - self.trap_if_misaligned(memarg, 
effective_address); + self.trap_if_misaligned(memarg, effective_address, 1); let narrow_cmp = self .builder .build_int_truncate(cmp, self.intrinsics.i8_ty, ""); @@ -10798,7 +10800,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 2, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 2); let narrow_cmp = self .builder .build_int_truncate(cmp, self.intrinsics.i16_ty, ""); @@ -10845,7 +10847,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 4, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 4); let narrow_cmp = self .builder .build_int_truncate(cmp, self.intrinsics.i32_ty, ""); @@ -10892,7 +10894,7 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { offset, 8, )?; - self.trap_if_misaligned(memarg, effective_address); + self.trap_if_misaligned(memarg, effective_address, 8); let old = self .builder .build_cmpxchg( @@ -11235,12 +11237,16 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { let memory_index = MemoryIndex::from_u32(memarg.memory); let (dst, val, timeout) = self.state.pop3()?; let wait32_fn_ptr = self.ctx.memory_wait32(memory_index, self.intrinsics); - let callable_func = inkwell::values::CallableValue::try_from(wait32_fn_ptr).unwrap(); + let callable_func = + inkwell::values::CallableValue::try_from(wait32_fn_ptr).unwrap(); let ret = self.builder.build_call( callable_func, &[ vmctx.as_basic_value_enum().into(), - self.intrinsics.i32_ty.const_int(memarg.memory as u64, false).into(), + self.intrinsics + .i32_ty + .const_int(memarg.memory as u64, false) + .into(), dst.into(), val.into(), timeout.into(), @@ -11253,12 +11259,16 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { let memory_index = MemoryIndex::from_u32(memarg.memory); let (dst, val, timeout) = self.state.pop3()?; let wait64_fn_ptr = self.ctx.memory_wait64(memory_index, self.intrinsics); - let callable_func = 
inkwell::values::CallableValue::try_from(wait64_fn_ptr).unwrap(); + let callable_func = + inkwell::values::CallableValue::try_from(wait64_fn_ptr).unwrap(); let ret = self.builder.build_call( callable_func, &[ vmctx.as_basic_value_enum().into(), - self.intrinsics.i32_ty.const_int(memarg.memory as u64, false).into(), + self.intrinsics + .i32_ty + .const_int(memarg.memory as u64, false) + .into(), dst.into(), val.into(), timeout.into(), @@ -11271,12 +11281,16 @@ impl<'ctx, 'a> LLVMFunctionCodeGenerator<'ctx, 'a> { let memory_index = MemoryIndex::from_u32(memarg.memory); let (dst, count) = self.state.pop2()?; let notify_fn_ptr = self.ctx.memory_notify(memory_index, self.intrinsics); - let callable_func = inkwell::values::CallableValue::try_from(notify_fn_ptr).unwrap(); + let callable_func = + inkwell::values::CallableValue::try_from(notify_fn_ptr).unwrap(); let cnt = self.builder.build_call( callable_func, &[ vmctx.as_basic_value_enum().into(), - self.intrinsics.i32_ty.const_int(memarg.memory as u64, false).into(), + self.intrinsics + .i32_ty + .const_int(memarg.memory as u64, false) + .into(), dst.into(), count.into(), ], diff --git a/tests/ignores.txt b/tests/ignores.txt index 96b081b7b04..08a5982bdc0 100644 --- a/tests/ignores.txt +++ b/tests/ignores.txt @@ -27,7 +27,6 @@ cranelift+aarch64+macos traps::start_trap_pretty singlepass+aarch64 spec::threads::atomic singlepass spec::threads::imports cranelift spec::threads::imports -llvm spec::threads::atomic llvm spec::threads::imports # Also neither LLVM nor Cranelift currently implement stack probing on AArch64. 
From 8a0bd4b33189eb2c109c4e15020d75ea7dc8af57 Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Fri, 9 Sep 2022 18:24:15 +0200 Subject: [PATCH 14/24] Added some aarch64 atomic access emitter (not finished) Removed public Imports::import_shared_memory and try to auto-initialize shared memory directly Fixed clippy Added Atomic Add/Sub/And/Or/Xor operator to Singlepass/AArch64 backend Added atomic_xchg support for Singlepass/AArch64 backend Finished all atomic access operator for Singlepass/Aarch64 backend --- lib/api/src/js/imports.rs | 6 +- lib/api/src/sys/imports.rs | 38 +- lib/api/src/sys/instance.rs | 2 +- lib/cli/src/commands/run/wasi.rs | 3 +- lib/compiler-singlepass/src/emitter_arm64.rs | 124 + lib/compiler-singlepass/src/machine_arm64.rs | 3587 ++++++++++++++---- lib/vm/src/lib.rs | 4 +- tests/ignores.txt | 1 - 8 files changed, 3110 insertions(+), 655 deletions(-) diff --git a/lib/api/src/js/imports.rs b/lib/api/src/js/imports.rs index a668b96d5fd..fb82e9885c1 100644 --- a/lib/api/src/js/imports.rs +++ b/lib/api/src/js/imports.rs @@ -133,7 +133,11 @@ impl Imports { /// Resolve and return a vector of imports in the order they are defined in the `module`'s source code. /// /// This means the returned `Vec` might be a subset of the imports contained in `self`. - pub fn imports_for_module(&self, module: &Module) -> Result, LinkError> { + pub fn imports_for_module( + &self, + module: &Module, + _store: &mut impl AsStoreMut, + ) -> Result, LinkError> { let mut ret = vec![]; for import in module.imports() { if let Some(imp) = self diff --git a/lib/api/src/sys/imports.rs b/lib/api/src/sys/imports.rs index b6864e5ee2f..18eef148f2c 100644 --- a/lib/api/src/sys/imports.rs +++ b/lib/api/src/sys/imports.rs @@ -114,11 +114,11 @@ impl Imports { /// Imports (any) shared memory into the imports. 
/// (if the module does not import memory then this function is ignored) - pub fn import_shared_memory( - &mut self, + pub(crate) fn import_shared_memory( + &self, module: &Module, store: &mut impl AsStoreMut, - ) -> Option { + ) -> Self { // Determine if shared memory needs to be created and imported let shared_memory = module .imports() .memories() .next() .map(|a| *a.ty()) .map(|ty| { let style = store.as_store_ref().tunables().memory_style(&ty); VMSharedMemory::new(&ty, &style).unwrap() }); + let mut ret = self.clone(); if let Some(memory) = shared_memory { - self.define( - "env", - "memory", - Memory::new_from_existing(store, memory.clone().into()), - ); - Some(memory) - } else { - None - } + // if the memory has already been defined, don't redefine it! + if !self + .map + .contains_key(&("env".to_string(), "memory".to_string())) + { + ret.define( + "env", + "memory", + Memory::new_from_existing(store, memory.into()), + ); + } + }; + ret } /// Returns the contents of a namespace as an `Exports`. @@ -162,10 +167,15 @@ impl Imports { /// Resolve and return a vector of imports in the order they are defined in the `module`'s source code. /// /// This means the returned `Vec` might be a subset of the imports contained in `self`.
- pub fn imports_for_module(&self, module: &Module) -> Result, LinkError> { + pub fn imports_for_module( + &self, + module: &Module, + store: &mut impl AsStoreMut, + ) -> Result, LinkError> { let mut ret = vec![]; + let imports = self.import_shared_memory(module, store); for import in module.imports() { - if let Some(imp) = self + if let Some(imp) = imports .map .get(&(import.module().to_string(), import.name().to_string())) { diff --git a/lib/api/src/sys/instance.rs b/lib/api/src/sys/instance.rs index ab8e9d5c293..315191128e6 100644 --- a/lib/api/src/sys/instance.rs +++ b/lib/api/src/sys/instance.rs @@ -116,7 +116,7 @@ impl Instance { imports: &Imports, ) -> Result { let imports = imports - .imports_for_module(module) + .imports_for_module(module, store) .map_err(InstantiationError::Link)?; let mut handle = module.instantiate(store, &imports)?; let exports = module diff --git a/lib/cli/src/commands/run/wasi.rs b/lib/cli/src/commands/run/wasi.rs index d913e5119c7..ffc70c42036 100644 --- a/lib/cli/src/commands/run/wasi.rs +++ b/lib/cli/src/commands/run/wasi.rs @@ -104,8 +104,7 @@ impl Wasi { is_wasix_module(module), std::sync::atomic::Ordering::Release, ); - let mut import_object = import_object_for_all_wasi_versions(store, &wasi_env.env); - import_object.import_shared_memory(module, store); + let import_object = import_object_for_all_wasi_versions(store, &wasi_env.env); let instance = Instance::new(store, module, &import_object)?; let memory = instance.exports.get_memory("memory")?; wasi_env.data_mut(store).set_memory(memory.clone()); diff --git a/lib/compiler-singlepass/src/emitter_arm64.rs b/lib/compiler-singlepass/src/emitter_arm64.rs index bdf010a38b2..074ae0acd00 100644 --- a/lib/compiler-singlepass/src/emitter_arm64.rs +++ b/lib/compiler-singlepass/src/emitter_arm64.rs @@ -153,6 +153,31 @@ pub trait EmitterARM64 { fn emit_strb(&mut self, sz: Size, reg: Location, dst: Location) -> Result<(), CompileError>; fn emit_strh(&mut self, sz: Size, reg: Location, dst: 
Location) -> Result<(), CompileError>; + fn emit_ldaxr(&mut self, sz: Size, reg: Location, dst: Location) -> Result<(), CompileError>; + fn emit_ldaxrb(&mut self, sz: Size, reg: Location, dst: Location) -> Result<(), CompileError>; + fn emit_ldaxrh(&mut self, sz: Size, reg: Location, dst: Location) -> Result<(), CompileError>; + fn emit_stlxr( + &mut self, + sz: Size, + status: Location, + reg: Location, + dst: Location, + ) -> Result<(), CompileError>; + fn emit_stlxrb( + &mut self, + sz: Size, + status: Location, + reg: Location, + dst: Location, + ) -> Result<(), CompileError>; + fn emit_stlxrh( + &mut self, + sz: Size, + status: Location, + reg: Location, + dst: Location, + ) -> Result<(), CompileError>; + fn emit_mov(&mut self, sz: Size, src: Location, dst: Location) -> Result<(), CompileError>; fn emit_movn(&mut self, sz: Size, reg: Location, val: u32) -> Result<(), CompileError>; @@ -1059,6 +1084,105 @@ impl EmitterARM64 for Assembler { Ok(()) } + fn emit_ldaxr(&mut self, sz: Size, reg: Location, dst: Location) -> Result<(), CompileError> { + match (sz, reg, dst) { + (Size::S32, Location::GPR(reg), Location::GPR(dst)) => { + let reg = reg.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; ldaxr W(reg), [X(dst)]); + } + (Size::S64, Location::GPR(reg), Location::GPR(dst)) => { + let reg = reg.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; ldaxr X(reg), [X(dst)]); + } + _ => codegen_error!("singlepass can't emit LDAXR {:?}, {:?}", reg, dst), + } + Ok(()) + } + fn emit_ldaxrb(&mut self, _sz: Size, reg: Location, dst: Location) -> Result<(), CompileError> { + match (reg, dst) { + (Location::GPR(reg), Location::GPR(dst)) => { + let reg = reg.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; ldaxrb W(reg), [X(dst)]); + } + _ => codegen_error!("singlepass can't emit LDAXRB {:?}, {:?}", reg, dst), + } + Ok(()) + } + fn emit_ldaxrh(&mut self, _sz: Size, reg: Location, dst: Location) -> 
Result<(), CompileError> { + match (reg, dst) { + (Location::GPR(reg), Location::GPR(dst)) => { + let reg = reg.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; ldaxrh W(reg), [X(dst)]); + } + _ => codegen_error!("singlepass can't emit LDAXRH {:?}, {:?}", reg, dst), + } + Ok(()) + } + fn emit_stlxr( + &mut self, + sz: Size, + status: Location, + reg: Location, + dst: Location, + ) -> Result<(), CompileError> { + match (sz, status, reg, dst) { + (Size::S32, Location::GPR(status), Location::GPR(reg), Location::GPR(dst)) => { + let reg = reg.into_index() as u32; + let dst = dst.into_index() as u32; + let status = status.into_index() as u32; + dynasm!(self ; stlxr W(status), W(reg), [X(dst)]); + } + (Size::S64, Location::GPR(status), Location::GPR(reg), Location::GPR(dst)) => { + let reg = reg.into_index() as u32; + let dst = dst.into_index() as u32; + let status = status.into_index() as u32; + dynasm!(self ; stlxr W(status), X(reg), [X(dst)]); + } + _ => codegen_error!("singlepass can't emit STLXR {:?}, {:?}", reg, dst), + } + Ok(()) + } + fn emit_stlxrb( + &mut self, + _sz: Size, + status: Location, + reg: Location, + dst: Location, + ) -> Result<(), CompileError> { + match (status, reg, dst) { + (Location::GPR(status), Location::GPR(reg), Location::GPR(dst)) => { + let reg = reg.into_index() as u32; + let dst = dst.into_index() as u32; + let status = status.into_index() as u32; + dynasm!(self ; stlxrb W(status), W(reg), [X(dst)]); + } + _ => codegen_error!("singlepass can't emit STLXRB {:?}, {:?}", reg, dst), + } + Ok(()) + } + fn emit_stlxrh( + &mut self, + _sz: Size, + status: Location, + reg: Location, + dst: Location, + ) -> Result<(), CompileError> { + match (status, reg, dst) { + (Location::GPR(status), Location::GPR(reg), Location::GPR(dst)) => { + let reg = reg.into_index() as u32; + let dst = dst.into_index() as u32; + let status = status.into_index() as u32; + dynasm!(self ; stlxrh W(status), W(reg), [X(dst)]); + } + _ => 
codegen_error!("singlepass can't emit STLXRH {:?}, {:?}", reg, dst), + } + Ok(()) + } + fn emit_mov(&mut self, sz: Size, src: Location, dst: Location) -> Result<(), CompileError> { match (sz, src, dst) { (Size::S64, Location::GPR(src), Location::GPR(dst)) => { diff --git a/lib/compiler-singlepass/src/machine_arm64.rs b/lib/compiler-singlepass/src/machine_arm64.rs index 806bb33eb1e..1f0c5b95ff2 100644 --- a/lib/compiler-singlepass/src/machine_arm64.rs +++ b/lib/compiler-singlepass/src/machine_arm64.rs @@ -1096,11 +1096,11 @@ impl MachineARM64 { self.release_gpr(tmp_bound); self.release_gpr(tmp_base); - let align = memarg.align; + let align = value_size as u32; if check_alignment && align != 1 { self.assembler.emit_tst( Size::S64, - Location::Imm32((align - 1).into()), + Location::Imm32(align - 1), Location::GPR(tmp_addr), )?; self.assembler @@ -3290,42 +3290,75 @@ impl Machine for MachineARM64 { } fn i32_atomic_load( &mut self, - _addr: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + addr: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_load unimplemented"); + self.memory_op( + addr, + memarg, + true, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| this.emit_relaxed_ldr32(Size::S32, ret, Location::Memory(addr, 0)), + ) } fn i32_atomic_load_8u( &mut self, - _addr: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + addr: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + 
unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_load_8u unimplemented"); + self.memory_op( + addr, + memarg, + true, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| this.emit_relaxed_ldr8(Size::S32, ret, Location::Memory(addr, 0)), + ) } fn i32_atomic_load_16u( &mut self, - _addr: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + addr: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_load_16u unimplemented"); + self.memory_op( + addr, + memarg, + true, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| this.emit_relaxed_ldr16(Size::S32, ret, Location::Memory(addr, 0)), + ) } fn i32_save( &mut self, @@ -3401,361 +3434,1321 @@ impl Machine for MachineARM64 { } fn i32_atomic_save( &mut self, - _value: Location, - _memarg: &MemoryImmediate, - _target_addr: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + target_value: Location, + memarg: &MemoryImmediate, + target_addr: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_save unimplemented"); + self.memory_op( + target_addr, + memarg, + true, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| this.emit_relaxed_str32(target_value, Location::Memory(addr, 0)), + )?; + self.assembler.emit_dmb() } fn i32_atomic_save_8( &mut self, - _value: Location, - _memarg: &MemoryImmediate, 
- _target_addr: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + target_value: Location, + memarg: &MemoryImmediate, + target_addr: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_save_8 unimplemented"); + self.memory_op( + target_addr, + memarg, + true, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| this.emit_relaxed_str8(target_value, Location::Memory(addr, 0)), + )?; + self.assembler.emit_dmb() } fn i32_atomic_save_16( &mut self, - _value: Location, - _memarg: &MemoryImmediate, - _target_addr: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + target_value: Location, + memarg: &MemoryImmediate, + target_addr: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_save_16 unimplemented"); + self.memory_op( + target_addr, + memarg, + true, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| this.emit_relaxed_str16(target_value, Location::Memory(addr, 0)), + )?; + self.assembler.emit_dmb() } // i32 atomic Add with i32 fn i32_atomic_add( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass 
i32_atomic_add unimplemented"); + self.memory_op( + target, + memarg, + true, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxr(Size::S32, dst, Location::GPR(addr))?; + this.emit_binop_add32(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxr( + Size::S32, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S32, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i32 atomic Add with u8 fn i32_atomic_add_8u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_add_8u unimplemented"); + self.memory_op( + target, + memarg, + true, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = 
this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrb(Size::S32, dst, Location::GPR(addr))?; + this.emit_binop_add32(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxrb( + Size::S32, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S32, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i32 atomic Add with u16 fn i32_atomic_add_16u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_add_16u unimplemented"); + self.memory_op( + target, + memarg, + true, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrh(Size::S32, dst, Location::GPR(addr))?; + 
this.emit_binop_add32(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxrh( + Size::S32, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S32, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i32 atomic Sub with i32 fn i32_atomic_sub( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_sub unimplemented"); + self.memory_op( + target, + memarg, + true, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxr(Size::S32, dst, Location::GPR(addr))?; + this.emit_binop_sub32(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxr( + Size::S32, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S32, ret, dst)?; + } + for r in 
temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i32 atomic Sub with u8 fn i32_atomic_sub_8u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_sub_8u unimplemented"); + self.memory_op( + target, + memarg, + true, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrb(Size::S32, dst, Location::GPR(addr))?; + this.emit_binop_sub32(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxrb( + Size::S32, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S32, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i32 atomic Sub with u16 fn i32_atomic_sub_16u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: 
Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_sub_16u unimplemented"); + self.memory_op( + target, + memarg, + true, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrh(Size::S32, dst, Location::GPR(addr))?; + this.emit_binop_sub32(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxrh( + Size::S32, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S32, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i32 atomic And with i32 fn i32_atomic_and( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_and unimplemented"); + self.memory_op( + target, + memarg, + true, + 4, + need_check, + imported_memories, + 
offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxr(Size::S32, dst, Location::GPR(addr))?; + this.emit_binop_and32(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxr( + Size::S32, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S32, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i32 atomic And with u8 fn i32_atomic_and_8u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_and_8u unimplemented"); + self.memory_op( + target, + memarg, + true, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; 
+ let dst = + this.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrb(Size::S32, dst, Location::GPR(addr))?; + this.emit_binop_and32(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxrb( + Size::S32, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S32, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i32 atomic And with u16 fn i32_atomic_and_16u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_and_16u unimplemented"); + self.memory_op( + target, + memarg, + true, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrh(Size::S32, dst, Location::GPR(addr))?; + this.emit_binop_and32(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxrh( + Size::S32, + Location::GPR(tmp2), 
+ Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S32, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i32 atomic Or with i32 fn i32_atomic_or( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_or unimplemented"); + self.memory_op( + target, + memarg, + true, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxr(Size::S32, dst, Location::GPR(addr))?; + this.emit_binop_or32(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxr( + Size::S32, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S32, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i32 atomic Or with u8 fn i32_atomic_or_8u( &mut self, - _loc: 
Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_or_8u unimplemented"); + self.memory_op( + target, + memarg, + true, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrb(Size::S32, dst, Location::GPR(addr))?; + this.emit_binop_or32(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxrb( + Size::S32, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S32, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i32 atomic Or with u16 fn i32_atomic_or_16u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + 
heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_or_16u unimplemented"); + self.memory_op( + target, + memarg, + true, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrh(Size::S32, dst, Location::GPR(addr))?; + this.emit_binop_or32(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxrh( + Size::S32, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S32, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i32 atomic Xor with i32 fn i32_atomic_xor( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_xor unimplemented"); + self.memory_op( + target, + memarg, + true, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = 
this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxr(Size::S32, dst, Location::GPR(addr))?; + this.emit_binop_xor32(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxr( + Size::S32, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S32, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i32 atomic Xor with u8 fn i32_atomic_xor_8u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_xor_8u unimplemented"); + self.memory_op( + target, + memarg, + true, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + let 
reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrb(Size::S32, dst, Location::GPR(addr))?; + this.emit_binop_xor32(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxrb( + Size::S32, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S32, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i32 atomic Xor with u16 fn i32_atomic_xor_16u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_xor_16u unimplemented"); + self.memory_op( + target, + memarg, + true, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrh(Size::S32, dst, Location::GPR(addr))?; + this.emit_binop_xor32(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxrh( + Size::S32, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, 
Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S32, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i32 atomic Exchange with i32 fn i32_atomic_xchg( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_xchg unimplemented"); + self.memory_op( + target, + memarg, + true, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + let org = + this.location_to_reg(Size::S32, loc, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxr(Size::S32, dst, Location::GPR(addr))?; + this.assembler.emit_stlxr( + Size::S32, + Location::GPR(tmp), + org, + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S32, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i32 atomic Exchange with u8 fn i32_atomic_xchg_8u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + 
loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_xchg_8u unimplemented"); + self.memory_op( + target, + memarg, + true, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + let org = + this.location_to_reg(Size::S32, loc, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrb(Size::S32, dst, Location::GPR(addr))?; + this.assembler.emit_stlxrb( + Size::S32, + Location::GPR(tmp), + org, + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S32, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i32 atomic Exchange with u16 fn i32_atomic_xchg_16u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_xchg_16u unimplemented"); + self.memory_op( + target, + memarg, + true, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps 
= vec![]; + let tmp = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + let org = + this.location_to_reg(Size::S32, loc, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrh(Size::S32, dst, Location::GPR(addr))?; + this.assembler.emit_stlxrh( + Size::S32, + Location::GPR(tmp), + org, + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S32, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i32 atomic Exchange with i32 fn i32_atomic_cmpxchg( &mut self, - _new: Location, - _cmp: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + new: Location, + cmp: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_cmpxchg unimplemented"); + self.memory_op( + target, + memarg, + true, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + let org = + this.location_to_reg(Size::S32, new, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + let nosame = this.get_label(); + + this.emit_label(reread)?; + 
this.assembler + .emit_ldaxr(Size::S32, dst, Location::GPR(addr))?; + this.emit_relaxed_cmp(Size::S32, dst, cmp)?; + this.assembler.emit_bcond_label(Condition::Ne, nosame)?; + this.assembler.emit_stlxr( + Size::S32, + Location::GPR(tmp), + org, + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp), reread)?; + this.assembler.emit_dmb()?; + + this.emit_label(nosame)?; + if dst != ret { + this.move_location(Size::S32, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i32 atomic Exchange with u8 fn i32_atomic_cmpxchg_8u( &mut self, - _new: Location, - _cmp: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + new: Location, + cmp: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_cmpxchg_8u unimplemented"); + self.memory_op( + target, + memarg, + true, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + let org = + this.location_to_reg(Size::S32, new, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + let nosame = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrb(Size::S32, dst, Location::GPR(addr))?; + this.emit_relaxed_cmp(Size::S32, dst, cmp)?; + this.assembler.emit_bcond_label(Condition::Ne, nosame)?; + this.assembler.emit_stlxrb( + Size::S32, + Location::GPR(tmp), + org, + 
Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp), reread)?; + this.assembler.emit_dmb()?; + + this.emit_label(nosame)?; + if dst != ret { + this.move_location(Size::S32, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i32 atomic Exchange with u16 fn i32_atomic_cmpxchg_16u( &mut self, - _new: Location, - _cmp: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + new: Location, + cmp: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i32_atomic_cmpxchg_16u unimplemented"); - } + self.memory_op( + target, + memarg, + true, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + let org = + this.location_to_reg(Size::S32, new, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + let nosame = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrh(Size::S32, dst, Location::GPR(addr))?; + this.emit_relaxed_cmp(Size::S32, dst, cmp)?; + this.assembler.emit_bcond_label(Condition::Ne, nosame)?; + this.assembler.emit_stlxrh( + Size::S32, + Location::GPR(tmp), + org, + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp), reread)?; + this.assembler.emit_dmb()?; + + this.emit_label(nosame)?; + if dst != ret { + this.move_location(Size::S32, ret, dst)?; + } + for r in temps 
{ + this.release_gpr(r); + } + Ok(()) + }, + ) + } fn emit_call_with_reloc( &mut self, @@ -4420,55 +5413,99 @@ impl Machine for MachineARM64 { } fn i64_atomic_load( &mut self, - _addr: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + addr: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_load unimplemented"); + self.memory_op( + addr, + memarg, + true, + 8, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| this.emit_relaxed_ldr64(Size::S64, ret, Location::Memory(addr, 0)), + ) } fn i64_atomic_load_8u( &mut self, - _addr: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + addr: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_load_8u unimplemented"); + self.memory_op( + addr, + memarg, + true, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| this.emit_relaxed_ldr8(Size::S64, ret, Location::Memory(addr, 0)), + ) } fn i64_atomic_load_16u( &mut self, - _addr: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + addr: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), 
CompileError> { - codegen_error!("singlepass i64_atomic_load_16u unimplemented"); + self.memory_op( + addr, + memarg, + true, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| this.emit_relaxed_ldr16(Size::S64, ret, Location::Memory(addr, 0)), + ) } fn i64_atomic_load_32u( &mut self, - _addr: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + addr: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_load_32u unimplemented"); + self.memory_op( + addr, + memarg, + true, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| this.emit_relaxed_ldr32(Size::S64, ret, Location::Memory(addr, 0)), + ) } fn i64_save( &mut self, @@ -4568,479 +5605,1759 @@ impl Machine for MachineARM64 { } fn i64_atomic_save( &mut self, - _value: Location, - _memarg: &MemoryImmediate, - _target_addr: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + target_value: Location, + memarg: &MemoryImmediate, + target_addr: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_save unimplemented"); + self.memory_op( + target_addr, + memarg, + true, + 8, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| this.emit_relaxed_str64(target_value, Location::Memory(addr, 0)), + )?; + self.assembler.emit_dmb() } fn i64_atomic_save_8( &mut self, - _value: Location, - _memarg: &MemoryImmediate, - _target_addr: Location, - 
_need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + target_value: Location, + memarg: &MemoryImmediate, + target_addr: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_save_8 unimplemented"); + self.memory_op( + target_addr, + memarg, + true, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| this.emit_relaxed_str8(target_value, Location::Memory(addr, 0)), + )?; + self.assembler.emit_dmb() } fn i64_atomic_save_16( &mut self, - _value: Location, - _memarg: &MemoryImmediate, - _target_addr: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + target_value: Location, + memarg: &MemoryImmediate, + target_addr: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_save_16 unimplemented"); + self.memory_op( + target_addr, + memarg, + true, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| this.emit_relaxed_str16(target_value, Location::Memory(addr, 0)), + )?; + self.assembler.emit_dmb() } fn i64_atomic_save_32( &mut self, - _value: Location, - _memarg: &MemoryImmediate, - _target_addr: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + target_value: Location, + memarg: &MemoryImmediate, + target_addr: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_save_32 unimplemented"); + self.memory_op( + target_addr, + 
memarg, + true, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| this.emit_relaxed_str32(target_value, Location::Memory(addr, 0)), + )?; + self.assembler.emit_dmb() } // i64 atomic Add with i64 fn i64_atomic_add( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, - ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_add unimplemented"); - } - // i64 atomic Add with u8 - fn i64_atomic_add_8u( - &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, + ) -> Result<(), CompileError> { + self.memory_op( + target, + memarg, + true, + 8, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxr(Size::S64, dst, Location::GPR(addr))?; + this.emit_binop_add64(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxr( + Size::S64, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), 
reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) + } + // i64 atomic Add with u8 + fn i64_atomic_add_8u( + &mut self, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_add_8u unimplemented"); + self.memory_op( + target, + memarg, + true, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrb(Size::S64, dst, Location::GPR(addr))?; + this.emit_binop_add64(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxrb( + Size::S64, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i64 atomic Add with u16 fn i64_atomic_add_16u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + 
imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_add_16u unimplemented"); + self.memory_op( + target, + memarg, + true, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrh(Size::S64, dst, Location::GPR(addr))?; + this.emit_binop_add64(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxrh( + Size::S64, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i64 atomic Add with u32 fn i64_atomic_add_32u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_add_32u unimplemented"); + self.memory_op( + target, + memarg, + true, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let 
mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxr(Size::S32, dst, Location::GPR(addr))?; + this.emit_binop_add64(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxr( + Size::S32, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i64 atomic Sub with i64 fn i64_atomic_sub( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_sub unimplemented"); + self.memory_op( + target, + memarg, + true, + 8, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, 
ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxr(Size::S64, dst, Location::GPR(addr))?; + this.emit_binop_sub64(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxr( + Size::S64, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i64 atomic Sub with u8 fn i64_atomic_sub_8u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_sub_8u unimplemented"); + self.memory_op( + target, + memarg, + true, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrb(Size::S64, dst, Location::GPR(addr))?; + this.emit_binop_sub64(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxrb( + Size::S64, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler 
+ .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i64 atomic Sub with u16 fn i64_atomic_sub_16u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_sub_16u unimplemented"); + self.memory_op( + target, + memarg, + true, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrh(Size::S64, dst, Location::GPR(addr))?; + this.emit_binop_sub64(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxrh( + Size::S64, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i64 atomic Sub with u32 fn i64_atomic_sub_32u( &mut self, - _loc: Location, - _target: Location, - _memarg: 
&MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_sub_32u unimplemented"); + self.memory_op( + target, + memarg, + true, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxr(Size::S32, dst, Location::GPR(addr))?; + this.emit_binop_sub64(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxr( + Size::S32, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i64 atomic And with i64 fn i64_atomic_and( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) 
-> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_and unimplemented"); + self.memory_op( + target, + memarg, + true, + 8, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxr(Size::S64, dst, Location::GPR(addr))?; + this.emit_binop_and64(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxr( + Size::S64, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i64 atomic And with u8 fn i64_atomic_and_8u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_and_8u unimplemented"); + self.memory_op( + target, + memarg, + true, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + 
CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrb(Size::S64, dst, Location::GPR(addr))?; + this.emit_binop_and64(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxrb( + Size::S64, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i64 atomic And with u16 fn i64_atomic_and_16u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_and_16u unimplemented"); + self.memory_op( + target, + memarg, + true, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + 
this.emit_label(reread)?; + this.assembler + .emit_ldaxrh(Size::S64, dst, Location::GPR(addr))?; + this.emit_binop_and64(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxrh( + Size::S64, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i64 atomic And with u32 fn i64_atomic_and_32u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_and_32u unimplemented"); + self.memory_op( + target, + memarg, + true, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxr(Size::S32, dst, Location::GPR(addr))?; + this.emit_binop_and64(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxr( + Size::S32, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + 
this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i64 atomic Or with i64 fn i64_atomic_or( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_or unimplemented"); + self.memory_op( + target, + memarg, + true, + 8, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxr(Size::S64, dst, Location::GPR(addr))?; + this.emit_binop_or64(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxr( + Size::S64, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i64 atomic Or with u8 fn i64_atomic_or_8u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: 
i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_or_8u unimplemented"); + self.memory_op( + target, + memarg, + true, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrb(Size::S64, dst, Location::GPR(addr))?; + this.emit_binop_or64(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxrb( + Size::S64, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i64 atomic Or with u16 fn i64_atomic_or_16u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_or_16u unimplemented"); + 
self.memory_op( + target, + memarg, + true, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrh(Size::S64, dst, Location::GPR(addr))?; + this.emit_binop_or64(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxrh( + Size::S64, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i64 atomic Or with u32 fn i64_atomic_or_32u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_or_32u unimplemented"); + self.memory_op( + target, + memarg, + true, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = 
this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxr(Size::S32, dst, Location::GPR(addr))?; + this.emit_binop_or64(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxr( + Size::S32, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } - // i64 atomic xor with i64 + // i64 atomic Xor with i64 fn i64_atomic_xor( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_xor unimplemented"); + self.memory_op( + target, + memarg, + true, + 8, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxr(Size::S64, dst, 
Location::GPR(addr))?; + this.emit_binop_xor64(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxr( + Size::S64, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } - // i64 atomic xor with u8 + // i64 atomic Xor with u8 fn i64_atomic_xor_8u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_xor_8u unimplemented"); + self.memory_op( + target, + memarg, + true, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrb(Size::S64, dst, Location::GPR(addr))?; + this.emit_binop_xor64(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxrb( + Size::S64, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret 
{ + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } - // i64 atomic xor with u16 + // i64 atomic Xor with u16 fn i64_atomic_xor_16u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_xor_16u unimplemented"); + self.memory_op( + target, + memarg, + true, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrh(Size::S64, dst, Location::GPR(addr))?; + this.emit_binop_xor64(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxrh( + Size::S64, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } - // i64 atomic xor with u32 + // i64 atomic Xor with u32 fn i64_atomic_xor_32u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - 
_imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_xor_32u unimplemented"); + self.memory_op( + target, + memarg, + true, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp1 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let tmp2 = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxr(Size::S32, dst, Location::GPR(addr))?; + this.emit_binop_xor64(dst, loc, Location::GPR(tmp1))?; + this.assembler.emit_stlxr( + Size::S32, + Location::GPR(tmp2), + Location::GPR(tmp1), + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp2), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i64 atomic Exchange with i64 fn i64_atomic_xchg( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - 
codegen_error!("singlepass i64_atomic_xchg unimplemented"); + self.memory_op( + target, + memarg, + true, + 8, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let org = + this.location_to_reg(Size::S64, loc, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxr(Size::S64, dst, Location::GPR(addr))?; + this.assembler.emit_stlxr( + Size::S64, + Location::GPR(tmp), + org, + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i64 atomic Exchange with u8 fn i64_atomic_xchg_8u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_xchg_8u unimplemented"); + self.memory_op( + target, + memarg, + true, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + 
let org = + this.location_to_reg(Size::S64, loc, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrb(Size::S64, dst, Location::GPR(addr))?; + this.assembler.emit_stlxrb( + Size::S64, + Location::GPR(tmp), + org, + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i64 atomic Exchange with u16 fn i64_atomic_xchg_16u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_xchg_16u unimplemented"); + self.memory_op( + target, + memarg, + true, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let org = + this.location_to_reg(Size::S64, loc, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrh(Size::S64, dst, Location::GPR(addr))?; + this.assembler.emit_stlxrh( + Size::S64, + Location::GPR(tmp), + org, + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + 
this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i64 atomic Exchange with u32 fn i64_atomic_xchg_32u( &mut self, - _loc: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + loc: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_xchg_32u unimplemented"); + self.memory_op( + target, + memarg, + true, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let org = + this.location_to_reg(Size::S64, loc, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxr(Size::S32, dst, Location::GPR(addr))?; + this.assembler.emit_stlxr( + Size::S32, + Location::GPR(tmp), + org, + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp), reread)?; + this.assembler.emit_dmb()?; + + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i64 atomic Exchange with i64 fn i64_atomic_cmpxchg( &mut self, - _new: Location, - _cmp: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + new: Location, + cmp: Location, + target: Location, + 
memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_cmpxchg unimplemented"); + self.memory_op( + target, + memarg, + true, + 8, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let org = + this.location_to_reg(Size::S64, new, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + let nosame = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxr(Size::S64, dst, Location::GPR(addr))?; + this.emit_relaxed_cmp(Size::S64, dst, cmp)?; + this.assembler.emit_bcond_label(Condition::Ne, nosame)?; + this.assembler.emit_stlxr( + Size::S64, + Location::GPR(tmp), + org, + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp), reread)?; + this.assembler.emit_dmb()?; + + this.emit_label(nosame)?; + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i64 atomic Exchange with u8 fn i64_atomic_cmpxchg_8u( &mut self, - _new: Location, - _cmp: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + new: Location, + cmp: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_cmpxchg_8u unimplemented"); + 
self.memory_op( + target, + memarg, + true, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let org = + this.location_to_reg(Size::S64, new, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + let nosame = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrb(Size::S64, dst, Location::GPR(addr))?; + this.emit_relaxed_cmp(Size::S64, dst, cmp)?; + this.assembler.emit_bcond_label(Condition::Ne, nosame)?; + this.assembler.emit_stlxrb( + Size::S64, + Location::GPR(tmp), + org, + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp), reread)?; + this.assembler.emit_dmb()?; + + this.emit_label(nosame)?; + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i64 atomic Exchange with u16 fn i64_atomic_cmpxchg_16u( &mut self, - _new: Location, - _cmp: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + new: Location, + cmp: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_cmpxchg_16u unimplemented"); + self.memory_op( + target, + memarg, + true, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp = this.acquire_temp_gpr().ok_or_else(|| { + 
CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let org = + this.location_to_reg(Size::S64, new, &mut temps, ImmType::None, false, None)?; + let reread = this.get_label(); + let nosame = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxrh(Size::S64, dst, Location::GPR(addr))?; + this.emit_relaxed_cmp(Size::S64, dst, cmp)?; + this.assembler.emit_bcond_label(Condition::Ne, nosame)?; + this.assembler.emit_stlxrh( + Size::S64, + Location::GPR(tmp), + org, + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp), reread)?; + this.assembler.emit_dmb()?; + + this.emit_label(nosame)?; + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } // i64 atomic Exchange with u32 fn i64_atomic_cmpxchg_32u( &mut self, - _new: Location, - _cmp: Location, - _target: Location, - _memarg: &MemoryImmediate, - _ret: Location, - _need_check: bool, - _imported_memories: bool, - _offset: i32, - _heap_access_oob: Label, - _unaligned_atomic: Label, + new: Location, + cmp: Location, + target: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + unaligned_atomic: Label, ) -> Result<(), CompileError> { - codegen_error!("singlepass i64_atomic_cmpxchg_32u unimplemented"); + self.memory_op( + target, + memarg, + true, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + unaligned_atomic, + |this, addr| { + let mut temps = vec![]; + let tmp = this.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + let dst = + this.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let org = + this.location_to_reg(Size::S64, new, &mut temps, ImmType::None, 
false, None)?; + let reread = this.get_label(); + let nosame = this.get_label(); + + this.emit_label(reread)?; + this.assembler + .emit_ldaxr(Size::S32, dst, Location::GPR(addr))?; + this.emit_relaxed_cmp(Size::S64, dst, cmp)?; + this.assembler.emit_bcond_label(Condition::Ne, nosame)?; + this.assembler.emit_stlxr( + Size::S32, + Location::GPR(tmp), + org, + Location::GPR(addr), + )?; + this.assembler + .emit_cbnz_label(Size::S32, Location::GPR(tmp), reread)?; + this.assembler.emit_dmb()?; + + this.emit_label(nosame)?; + if dst != ret { + this.move_location(Size::S64, ret, dst)?; + } + for r in temps { + this.release_gpr(r); + } + Ok(()) + }, + ) } fn f32_load( diff --git a/lib/vm/src/lib.rs b/lib/vm/src/lib.rs index 8d5602fefed..3b1abc55127 100644 --- a/lib/vm/src/lib.rs +++ b/lib/vm/src/lib.rs @@ -45,7 +45,9 @@ pub use crate::function_env::VMFunctionEnvironment; pub use crate::global::*; pub use crate::imports::Imports; pub use crate::instance::{InstanceAllocator, InstanceHandle}; -pub use crate::memory::{initialize_memory_with_data, LinearMemory, VMMemory, VMOwnedMemory, VMSharedMemory}; +pub use crate::memory::{ + initialize_memory_with_data, LinearMemory, VMMemory, VMOwnedMemory, VMSharedMemory, +}; pub use crate::mmap::Mmap; pub use crate::probestack::PROBESTACK; pub use crate::sig_registry::SignatureRegistry; diff --git a/tests/ignores.txt b/tests/ignores.txt index 08a5982bdc0..b84bbeaa442 100644 --- a/tests/ignores.txt +++ b/tests/ignores.txt @@ -24,7 +24,6 @@ llvm traps::start_trap_pretty cranelift+aarch64+macos traps::start_trap_pretty # Atomics (WIP) -singlepass+aarch64 spec::threads::atomic singlepass spec::threads::imports cranelift spec::threads::imports llvm spec::threads::imports From e4523f7e6ccdb6a8d5ae800375cce6b9c8c5cc2b Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Mon, 10 Oct 2022 15:07:28 +0200 Subject: [PATCH 15/24] Fixed issue #3167 and added the relevant test --- .../src/translator/code_translator.rs | 21 +++++++++++++++---- 
tests/wast/wasmer/README.md | 6 +++++- tests/wast/wasmer/atomic_load.wast | 9 ++++++++ 3 files changed, 31 insertions(+), 5 deletions(-) create mode 100755 tests/wast/wasmer/atomic_load.wast diff --git a/lib/compiler-cranelift/src/translator/code_translator.rs b/lib/compiler-cranelift/src/translator/code_translator.rs index e38640fb3e0..6b42a8e6c6e 100644 --- a/lib/compiler-cranelift/src/translator/code_translator.rs +++ b/lib/compiler-cranelift/src/translator/code_translator.rs @@ -2408,11 +2408,24 @@ fn finalise_atomic_mem_addr( state: &mut FuncTranslationState, environ: &mut FE, ) -> WasmResult { - // Check the alignment of `linear_mem_addr`. let access_ty_bytes = access_ty.bytes(); - let final_lma = builder - .ins() - .iadd_imm(linear_mem_addr, memarg.offset as i64); + let final_lma = if memarg.offset > 0 { + assert!(builder.func.dfg.value_type(linear_mem_addr) == I32); + let linear_mem_addr = builder.ins().uextend(I64, linear_mem_addr); + let a = builder + .ins() + .iadd_imm(linear_mem_addr, memarg.offset as i64); + let cflags = builder.ins().ifcmp_imm(a, 0x1_0000_0000i64); + builder.ins().trapif( + IntCC::UnsignedGreaterThanOrEqual, + cflags, + ir::TrapCode::HeapOutOfBounds, + ); + builder.ins().ireduce(I32, a) + } else { + linear_mem_addr + }; + // Check the alignment of `linear_mem_addr`. if access_ty_bytes != 1 { assert!(access_ty_bytes == 2 || access_ty_bytes == 4 || access_ty_bytes == 8); let final_lma_misalignment = builder diff --git a/tests/wast/wasmer/README.md b/tests/wast/wasmer/README.md index 60c933dfb2c..17d398c6aa3 100644 --- a/tests/wast/wasmer/README.md +++ b/tests/wast/wasmer/README.md @@ -31,4 +31,8 @@ front, not once in each call. 
## Divide by Zero: `divide.wast`
 
-This is a simple test to check that a divide by zero is correctly trapped
\ No newline at end of file
+This is a simple test to check that a divide by zero is correctly trapped
+
+## Atomic Load: `atomic_load.wast`
+
+This is a simple test to check that an atomic load reaching "too far" into memory triggers an OutOfBounds trap
diff --git a/tests/wast/wasmer/atomic_load.wast b/tests/wast/wasmer/atomic_load.wast
new file mode 100755
index 00000000000..932b39a1da0
--- /dev/null
+++ b/tests/wast/wasmer/atomic_load.wast
@@ -0,0 +1,9 @@
+(module
+  (memory 1)
+  (func (export "atomic_load")
+    i32.const 0xffff_fff0
+    i32.atomic.load offset=16
+    drop
+  )
+)
+(assert_trap (invoke "atomic_load") "out of bound")

From 519f29b7a8fc79ec9a42db87260982613b4142b0 Mon Sep 17 00:00:00 2001
From: Syrus Akbary
Date: Mon, 10 Oct 2022 16:19:34 +0200
Subject: [PATCH 16/24] Added exists function to imports

---
 lib/api/src/sys/imports.rs | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/lib/api/src/sys/imports.rs b/lib/api/src/sys/imports.rs
index 18eef148f2c..605da276bc8 100644
--- a/lib/api/src/sys/imports.rs
+++ b/lib/api/src/sys/imports.rs
@@ -55,9 +55,7 @@ impl Imports {
     /// import_object.get_export("module", "name");
     /// ```
     pub fn get_export(&self, module: &str, name: &str) -> Option<Extern> {
-        if self
-            .map
-            .contains_key(&(module.to_string(), name.to_string()))
+        if self.exists(module, name)
         {
             let ext = &self.map[&(module.to_string(), name.to_string())];
             return Some(ext.clone());
@@ -65,6 +63,20 @@ impl Imports {
         None
     }
 
+    /// Returns true if an export exists for the given module and name.
+ /// + /// # Usage + /// ```no_run + /// # use wasmer::Imports; + /// let mut import_object = Imports::new(); + /// import_object.exists("module", "name"); + /// ``` + pub fn exists(&self, module: &str, name: &str) -> bool { + self + .map + .contains_key(&(module.to_string(), name.to_string())) + } + /// Returns true if the Imports contains namespace with the provided name. pub fn contains_namespace(&self, name: &str) -> bool { self.map.keys().any(|(k, _)| (k == name)) From b86f129c4466502dd8e9818165792182354fcd81 Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Mon, 10 Oct 2022 18:23:06 +0200 Subject: [PATCH 17/24] Moved import_shared_memory to wasi specific, and use this function on the run/wasi cli command directly Fixed linter Fixed js build Fixed linter --- lib/api/src/sys/imports.rs | 53 +++----------------------------- lib/api/src/sys/instance.rs | 2 +- lib/cli/src/commands/run/wasi.rs | 7 +++-- lib/wasi/src/lib.rs | 2 ++ lib/wasi/src/utils.rs | 42 ++++++++++++++++++++++++- 5 files changed, 53 insertions(+), 53 deletions(-) diff --git a/lib/api/src/sys/imports.rs b/lib/api/src/sys/imports.rs index 605da276bc8..8e988dadeef 100644 --- a/lib/api/src/sys/imports.rs +++ b/lib/api/src/sys/imports.rs @@ -1,12 +1,11 @@ //! The import module contains the implementation data structures and helper functions used to //! manipulate and access a wasm module's imports including memories, tables, globals, and //! functions. -use crate::{AsStoreMut, Exports, Extern, Memory, Module}; +use crate::{Exports, Extern, Module}; use std::collections::HashMap; use std::fmt; use wasmer_compiler::LinkError; use wasmer_types::ImportError; -use wasmer_vm::VMSharedMemory; /// All of the import data used when instantiating. 
/// @@ -55,8 +54,7 @@ impl Imports { /// import_object.get_export("module", "name"); /// ``` pub fn get_export(&self, module: &str, name: &str) -> Option { - if self.exists(module, name) - { + if self.exists(module, name) { let ext = &self.map[&(module.to_string(), name.to_string())]; return Some(ext.clone()); } @@ -72,8 +70,7 @@ impl Imports { /// import_object.exists("module", "name"); /// ``` pub fn exists(&self, module: &str, name: &str) -> bool { - self - .map + self.map .contains_key(&(module.to_string(), name.to_string())) } @@ -124,41 +121,6 @@ impl Imports { .insert((ns.to_string(), name.to_string()), val.into()); } - /// Imports (any) shared memory into the imports. - /// (if the module does not import memory then this function is ignored) - pub(crate) fn import_shared_memory( - &self, - module: &Module, - store: &mut impl AsStoreMut, - ) -> Self { - // Determine if shared memory needs to be created and imported - let shared_memory = module - .imports() - .memories() - .next() - .map(|a| *a.ty()) - .map(|ty| { - let style = store.as_store_ref().tunables().memory_style(&ty); - VMSharedMemory::new(&ty, &style).unwrap() - }); - - let mut ret = self.clone(); - if let Some(memory) = shared_memory { - // if the memory has already be defined, don't redefine it! - if !self - .map - .contains_key(&("env".to_string(), "memory".to_string())) - { - ret.define( - "env", - "memory", - Memory::new_from_existing(store, memory.into()), - ); - } - }; - ret - } - /// Returns the contents of a namespace as an `Exports`. /// /// Returns `None` if the namespace doesn't exist. @@ -179,15 +141,10 @@ impl Imports { /// Resolve and return a vector of imports in the order they are defined in the `module`'s source code. /// /// This means the returned `Vec` might be a subset of the imports contained in `self`. 
- pub fn imports_for_module( - &self, - module: &Module, - store: &mut impl AsStoreMut, - ) -> Result, LinkError> { + pub fn imports_for_module(&self, module: &Module) -> Result, LinkError> { let mut ret = vec![]; - let imports = self.import_shared_memory(module, store); for import in module.imports() { - if let Some(imp) = imports + if let Some(imp) = self .map .get(&(import.module().to_string(), import.name().to_string())) { diff --git a/lib/api/src/sys/instance.rs b/lib/api/src/sys/instance.rs index 315191128e6..ab8e9d5c293 100644 --- a/lib/api/src/sys/instance.rs +++ b/lib/api/src/sys/instance.rs @@ -116,7 +116,7 @@ impl Instance { imports: &Imports, ) -> Result { let imports = imports - .imports_for_module(module, store) + .imports_for_module(module) .map_err(InstantiationError::Link)?; let mut handle = module.instantiate(store, &imports)?; let exports = module diff --git a/lib/cli/src/commands/run/wasi.rs b/lib/cli/src/commands/run/wasi.rs index ffc70c42036..af78dabe31d 100644 --- a/lib/cli/src/commands/run/wasi.rs +++ b/lib/cli/src/commands/run/wasi.rs @@ -4,8 +4,8 @@ use std::collections::BTreeSet; use std::path::PathBuf; use wasmer::{AsStoreMut, FunctionEnv, Instance, Module, RuntimeError, Value}; use wasmer_wasi::{ - get_wasi_versions, import_object_for_all_wasi_versions, is_wasix_module, WasiEnv, WasiError, - WasiState, WasiVersion, + get_wasi_versions, import_object_for_all_wasi_versions, is_wasix_module, + wasi_import_shared_memory, WasiEnv, WasiError, WasiState, WasiVersion, }; use clap::Parser; @@ -104,7 +104,8 @@ impl Wasi { is_wasix_module(module), std::sync::atomic::Ordering::Release, ); - let import_object = import_object_for_all_wasi_versions(store, &wasi_env.env); + let mut import_object = import_object_for_all_wasi_versions(store, &wasi_env.env); + wasi_import_shared_memory(&mut import_object, module, store); let instance = Instance::new(store, module, &import_object)?; let memory = instance.exports.get_memory("memory")?; 
wasi_env.data_mut(store).set_memory(memory.clone()); diff --git a/lib/wasi/src/lib.rs b/lib/wasi/src/lib.rs index 81a77fc7fcd..787bc239158 100644 --- a/lib/wasi/src/lib.rs +++ b/lib/wasi/src/lib.rs @@ -54,7 +54,9 @@ pub use crate::state::{ pub use crate::syscalls::types; #[cfg(feature = "wasix")] pub use crate::utils::is_wasix_module; +pub use crate::utils::wasi_import_shared_memory; pub use crate::utils::{get_wasi_version, get_wasi_versions, is_wasi_module, WasiVersion}; + pub use wasmer_vbus::{UnsupportedVirtualBus, VirtualBus}; #[deprecated(since = "2.1.0", note = "Please use `wasmer_vfs::FsError`")] pub use wasmer_vfs::FsError as WasiFsError; diff --git a/lib/wasi/src/utils.rs b/lib/wasi/src/utils.rs index 571a954fc38..b05c5c302db 100644 --- a/lib/wasi/src/utils.rs +++ b/lib/wasi/src/utils.rs @@ -1,5 +1,7 @@ use std::collections::BTreeSet; -use wasmer::Module; +#[cfg(not(feature = "js"))] +use wasmer::vm::VMSharedMemory; +use wasmer::{AsStoreMut, Imports, Memory, Module}; use wasmer_wasi_types::wasi::Errno; #[allow(dead_code)] @@ -48,6 +50,44 @@ pub fn map_io_err(err: std::io::Error) -> Errno { } } +/// Imports (any) shared memory into the imports. +/// (if the module does not import memory then this function is ignored) +#[cfg(not(feature = "js"))] +pub fn wasi_import_shared_memory( + imports: &mut Imports, + module: &Module, + store: &mut impl AsStoreMut, +) { + // Determine if shared memory needs to be created and imported + let shared_memory = module + .imports() + .memories() + .next() + .map(|a| *a.ty()) + .map(|ty| { + let style = store.as_store_ref().tunables().memory_style(&ty); + VMSharedMemory::new(&ty, &style).unwrap() + }); + + if let Some(memory) = shared_memory { + // if the memory has already be defined, don't redefine it! 
+ if !imports.exists("env", "memory") { + imports.define( + "env", + "memory", + Memory::new_from_existing(store, memory.into()), + ); + } + }; +} +#[cfg(feature = "js")] +pub fn wasi_import_shared_memory( + _imports: &mut Imports, + _module: &Module, + _store: &mut impl AsStoreMut, +) { +} + /// The version of WASI. This is determined by the imports namespace /// string. #[derive(Debug, Clone, Copy, Eq)] From 3144ef2b3d49799e172317128548c9066406f1fa Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Wed, 12 Oct 2022 20:18:07 +0200 Subject: [PATCH 18/24] Enable threads/imports test, but disabling multiple table individual tests as it's not yet supported --- tests/compilers/wast.rs | 8 ++++++ tests/ignores.txt | 5 ---- tests/lib/wast/src/spectest.rs | 4 +++ .../wast/spec/proposals/threads/imports.wast | 26 +++++++++---------- 4 files changed, 25 insertions(+), 18 deletions(-) diff --git a/tests/compilers/wast.rs b/tests/compilers/wast.rs index 576e62e19fa..a27d24b56f5 100644 --- a/tests/compilers/wast.rs +++ b/tests/compilers/wast.rs @@ -22,12 +22,16 @@ pub fn run_wast(mut config: crate::Config, wast_path: &str) -> anyhow::Result<() let mut features = Features::default(); let is_bulkmemory = wast_path.contains("bulk-memory"); let is_simd = wast_path.contains("simd"); + let is_threads = wast_path.contains("threads"); if is_bulkmemory { features.bulk_memory(true); } if is_simd { features.simd(true); } + if is_threads { + features.threads(true); + } if config.compiler == crate::Compiler::Singlepass { features.multi_value(false); } @@ -53,6 +57,10 @@ pub fn run_wast(mut config: crate::Config, wast_path: &str) -> anyhow::Result<() "Validation error: Invalid var_u32", ]); } + if is_threads { + // We allow this, so tests can be run properly for `simd_const` test. 
+ wast.allow_instantiation_failures(&["Validation error: multiple tables"]); + } if config.compiler == crate::Compiler::Singlepass { // We don't support multivalue yet in singlepass wast.allow_instantiation_failures(&[ diff --git a/tests/ignores.txt b/tests/ignores.txt index b84bbeaa442..39978d01cd9 100644 --- a/tests/ignores.txt +++ b/tests/ignores.txt @@ -23,11 +23,6 @@ singlepass+aarch64+macos traps::start_trap_pretty llvm traps::start_trap_pretty cranelift+aarch64+macos traps::start_trap_pretty -# Atomics (WIP) -singlepass spec::threads::imports -cranelift spec::threads::imports -llvm spec::threads::imports - # Also neither LLVM nor Cranelift currently implement stack probing on AArch64. # https://github.com/wasmerio/wasmer/issues/2808 cranelift+aarch64 spec::skip_stack_guard_page diff --git a/tests/lib/wast/src/spectest.rs b/tests/lib/wast/src/spectest.rs index 9c2433ecd48..b4d44938491 100644 --- a/tests/lib/wast/src/spectest.rs +++ b/tests/lib/wast/src/spectest.rs @@ -28,6 +28,9 @@ pub fn spectest_importobject(store: &mut Store) -> Imports { let ty = MemoryType::new(1, Some(2), false); let memory = Memory::new(store, ty).unwrap(); + let ty = MemoryType::new(1, Some(2), true); + let shared_memory = Memory::new(store, ty).unwrap(); + imports! 
{ "spectest" => { "print" => print, @@ -43,6 +46,7 @@ pub fn spectest_importobject(store: &mut Store) -> Imports { "global_f64" => global_f64, "table" => table, "memory" => memory, + "shared_memory" => shared_memory, }, } } diff --git a/tests/wast/spec/proposals/threads/imports.wast b/tests/wast/spec/proposals/threads/imports.wast index 51dfbceaa28..4567171c782 100644 --- a/tests/wast/spec/proposals/threads/imports.wast +++ b/tests/wast/spec/proposals/threads/imports.wast @@ -305,19 +305,19 @@ (assert_trap (invoke "call" (i32.const 3)) "uninitialized element") (assert_trap (invoke "call" (i32.const 100)) "undefined element") - -(assert_invalid - (module (import "" "" (table 10 funcref)) (import "" "" (table 10 funcref))) - "multiple tables" -) -(assert_invalid - (module (import "" "" (table 10 funcref)) (table 10 funcref)) - "multiple tables" -) -(assert_invalid - (module (table 10 funcref) (table 10 funcref)) - "multiple tables" -) +;; No multiple table yet. +;;(assert_invalid +;; (module (import "" "" (table 10 funcref)) (import "" "" (table 10 funcref))) +;; "multiple tables" +;;) +;;(assert_invalid +;; (module (import "" "" (table 10 funcref)) (table 10 funcref)) +;; "multiple tables" +;;) +;;(assert_invalid +;; (module (table 10 funcref) (table 10 funcref)) +;; "multiple tables" +;;) (module (import "test" "table-10-inf" (table 10 funcref))) (module (import "test" "table-10-inf" (table 5 funcref))) From fce0ac907d34ac5f3583e25fae43057cd669923a Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Thu, 24 Nov 2022 18:11:00 +0100 Subject: [PATCH 19/24] Update lib/vm/src/instance/mod.rs Co-authored-by: Christoph Herzog --- lib/vm/src/instance/mod.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/vm/src/instance/mod.rs b/lib/vm/src/instance/mod.rs index 878b50c47ec..d67eb7bbe02 100644 --- a/lib/vm/src/instance/mod.rs +++ b/lib/vm/src/instance/mod.rs @@ -795,8 +795,7 @@ impl Instance { // fetch the notifier let key = (index, dst); let mut conds = 
self.conditions.lock().unwrap(); - conds.entry(key).or_insert_with(Vec::new); - let v = conds.get_mut(&key).unwrap(); + let v = conds.entry(key).or_insert_with(Vec::new); v.push((current(), false)); drop(conds); if timeout < 0 { From dba6a6a0ff7e0c1a82526fdce925156d5ae9e99c Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Thu, 24 Nov 2022 18:11:58 +0100 Subject: [PATCH 20/24] Update lib/vm/src/instance/mod.rs Co-authored-by: Christoph Herzog --- lib/vm/src/instance/mod.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/vm/src/instance/mod.rs b/lib/vm/src/instance/mod.rs index d67eb7bbe02..859b4e56f80 100644 --- a/lib/vm/src/instance/mod.rs +++ b/lib/vm/src/instance/mod.rs @@ -939,8 +939,7 @@ impl Instance { let key = (memory_index.as_u32(), dst); let mut conds = self.conditions.lock().unwrap(); let mut cnt = 0u32; - if conds.contains_key(&key) { - let v = conds.get_mut(&key).unwrap(); + if let Some(v) = conds.get_mut(&key) { for (t, b) in v { if cnt < count { *b = true; // mark as was waiked up From 23029d894174360b36782b78b7b3a6a54225c05e Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Fri, 25 Nov 2022 10:55:27 +0100 Subject: [PATCH 21/24] Refactore the Wait/Notify HashMap for more clarity --- lib/vm/src/instance/mod.rs | 68 +++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 20 deletions(-) diff --git a/lib/vm/src/instance/mod.rs b/lib/vm/src/instance/mod.rs index 859b4e56f80..c025e85016e 100644 --- a/lib/vm/src/instance/mod.rs +++ b/lib/vm/src/instance/mod.rs @@ -42,6 +42,20 @@ use wasmer_types::{ MemoryIndex, ModuleInfo, Pages, SignatureIndex, TableIndex, TableInitializer, VMOffsets, }; +#[derive(Hash, Eq, PartialEq, Clone, Copy)] +struct NotifyLocation { + memory_index: u32, + address: u32, +} + +struct NotifyWaiter { + thread: Thread, + notified: bool, +} +struct NotifyMap { + map: HashMap>, +} + /// A WebAssembly instance. /// /// The type is dynamically-sized. 
Indeed, the `vmctx` field can @@ -92,7 +106,7 @@ pub(crate) struct Instance { imported_funcrefs: BoxedSlice>, /// The Hasmap with the Notify for the Notify/wait opcodes - conditions: Arc>>>, + conditions: Arc>, /// Additional context used by compiled WebAssembly code. This /// field is last, and represents a dynamically-sized array that @@ -793,10 +807,16 @@ impl Instance { // because `park_timeout` doesn't gives any information on why it returns fn do_wait(&mut self, index: u32, dst: u32, timeout: i64) -> u32 { // fetch the notifier - let key = (index, dst); + let key = NotifyLocation { + memory_index: index, + address: dst, + }; let mut conds = self.conditions.lock().unwrap(); - let v = conds.entry(key).or_insert_with(Vec::new); - v.push((current(), false)); + let v = conds.map.entry(key).or_insert_with(Vec::new); + v.push(NotifyWaiter { + thread: current(), + notified: false, + }); drop(conds); if timeout < 0 { park(); @@ -804,19 +824,19 @@ impl Instance { park_timeout(std::time::Duration::from_nanos(timeout as u64)); } let mut conds = self.conditions.lock().unwrap(); - let v = conds.get_mut(&key).unwrap(); + let v = conds.map.get_mut(&key).unwrap(); let id = current().id(); let mut ret = 0; v.retain(|cond| { - if cond.0.id() == id { - ret = if cond.1 { 0 } else { 2 }; + if cond.thread.id() == id { + ret = if cond.notified { 0 } else { 2 }; false } else { true } }); if v.is_empty() { - conds.remove(&key); + conds.map.remove(&key); } ret } @@ -936,14 +956,17 @@ impl Instance { //} // fetch the notifier - let key = (memory_index.as_u32(), dst); + let key = NotifyLocation { + memory_index: memory_index.as_u32(), + address: dst, + }; let mut conds = self.conditions.lock().unwrap(); let mut cnt = 0u32; - if let Some(v) = conds.get_mut(&key) { - for (t, b) in v { + if let Some(v) = conds.map.get_mut(&key) { + for waiter in v { if cnt < count { - *b = true; // mark as was waiked up - t.unpark(); // wakeup! 
+ waiter.notified = true; // mark as was waiked up + waiter.thread.unpark(); // wakeup! cnt += 1; } } @@ -964,15 +987,18 @@ impl Instance { //} // fetch the notifier - let key = (memory_index.as_u32(), dst); + let key = NotifyLocation { + memory_index: memory_index.as_u32(), + address: dst, + }; let mut conds = self.conditions.lock().unwrap(); let mut cnt = 0u32; - if conds.contains_key(&key) { - let v = conds.get_mut(&key).unwrap(); - for (t, b) in v { + if conds.map.contains_key(&key) { + let v = conds.map.get_mut(&key).unwrap(); + for waiter in v { if cnt < count { - *b = true; // mark as was waiked up - t.unpark(); // wakeup! + waiter.notified = true; // mark as was waiked up + waiter.thread.unpark(); // wakeup! cnt += 1; } } @@ -1071,7 +1097,9 @@ impl InstanceHandle { funcrefs, imported_funcrefs, vmctx: VMContext {}, - conditions: Arc::new(Mutex::new(HashMap::new())), + conditions: Arc::new(Mutex::new(NotifyMap { + map: HashMap::new(), + })), }; let mut instance_handle = allocator.write_instance(instance); From 4329e39404a414fc429a0d6d24287d365f1c3861 Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Fri, 25 Nov 2022 11:59:01 +0100 Subject: [PATCH 22/24] Added Trap when more the 2^32 waiters are in queue --- lib/vm/src/instance/mod.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/lib/vm/src/instance/mod.rs b/lib/vm/src/instance/mod.rs index c025e85016e..fee32306ba1 100644 --- a/lib/vm/src/instance/mod.rs +++ b/lib/vm/src/instance/mod.rs @@ -838,6 +838,9 @@ impl Instance { if v.is_empty() { conds.map.remove(&key); } + if conds.map.len() > 1 << 32 { + ret = 0xffff; + } ret } @@ -860,6 +863,10 @@ impl Instance { if ret == 0 { ret = self.do_wait(memory_index.as_u32(), dst, timeout); } + if ret == 0xffff { + // ret is 0xffff if there is more than 2^32 waiter in queue + return Err(Trap::lib(TrapCode::TableAccessOutOfBounds)); + } Ok(ret) } else { ret @@ -886,6 +893,10 @@ impl Instance { if ret == 0 { ret = self.do_wait(memory_index.as_u32(), dst, 
timeout); } + if ret == 0xffff { + // ret is 0xffff if there is more than 2^32 waiter in queue + return Err(Trap::lib(TrapCode::TableAccessOutOfBounds)); + } Ok(ret) } else { ret @@ -911,6 +922,10 @@ impl Instance { if ret == 0 { ret = self.do_wait(memory_index.as_u32(), dst, timeout); } + if ret == 0xffff { + // ret is 0xffff if there is more than 2^32 waiter in queue + return Err(Trap::lib(TrapCode::TableAccessOutOfBounds)); + } Ok(ret) } else { ret @@ -937,6 +952,10 @@ impl Instance { if ret == 0 { ret = self.do_wait(memory_index.as_u32(), dst, timeout); } + if ret == 0xffff { + // ret is 0xffff if there is more than 2^32 waiter in queue + return Err(Trap::lib(TrapCode::TableAccessOutOfBounds)); + } Ok(ret) } else { ret From f26e9ba20222e67f9c79523a14747dd2e216bb62 Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Fri, 25 Nov 2022 12:18:42 +0100 Subject: [PATCH 23/24] Added a Trap for unaligned waiter access --- lib/vm/src/vmcontext.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/vm/src/vmcontext.rs b/lib/vm/src/vmcontext.rs index eb16620096a..f87df89c471 100644 --- a/lib/vm/src/vmcontext.rs +++ b/lib/vm/src/vmcontext.rs @@ -381,7 +381,7 @@ pub(crate) unsafe fn memory_fill( /// /// # Errors /// -/// Returns a `Trap` error if the memory range is out of bounds. +/// Returns a `Trap` error if the memory range is out of bounds or 32bits unligned. /// /// # Safety /// memory access is unsafe @@ -395,6 +395,9 @@ pub(crate) unsafe fn memory32_atomic_check32( } let dst = isize::try_from(dst).unwrap(); + if dst & 0b11 != 0 { + return Err(Trap::lib(TrapCode::UnalignedAtomic)); + } // Bounds and casts are checked above, by this point we know that // everything is safe. @@ -409,7 +412,7 @@ pub(crate) unsafe fn memory32_atomic_check32( /// /// # Errors /// -/// Returns a `Trap` error if the memory range is out of bounds. +/// Returns a `Trap` error if the memory range is out of bounds or 64bits unaligned. 
/// /// # Safety /// memory access is unsafe @@ -423,6 +426,9 @@ pub(crate) unsafe fn memory32_atomic_check64( } let dst = isize::try_from(dst).unwrap(); + if dst & 0b111 != 0 { + return Err(Trap::lib(TrapCode::UnalignedAtomic)); + } // Bounds and casts are checked above, by this point we know that // everything is safe. From 68b8995355f070261ea3254c20d7693f96d834f4 Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Fri, 25 Nov 2022 13:06:53 +0100 Subject: [PATCH 24/24] Refactor atomic_notify with a new funciton to avoid duplication --- lib/vm/src/instance/mod.rs | 43 ++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/lib/vm/src/instance/mod.rs b/lib/vm/src/instance/mod.rs index fee32306ba1..45224b64ee3 100644 --- a/lib/vm/src/instance/mod.rs +++ b/lib/vm/src/instance/mod.rs @@ -962,6 +962,21 @@ impl Instance { } } + fn do_notify(&mut self, key: NotifyLocation, count: u32) -> Result { + let mut conds = self.conditions.lock().unwrap(); + let mut cnt = 0u32; + if let Some(v) = conds.map.get_mut(&key) { + for waiter in v { + if cnt < count { + waiter.notified = true; // mark as was waiked up + waiter.thread.unpark(); // wakeup! + cnt += 1; + } + } + } + Ok(cnt) + } + /// Perform an Atomic.Notify pub(crate) fn local_memory_notify( &mut self, @@ -979,19 +994,9 @@ impl Instance { memory_index: memory_index.as_u32(), address: dst, }; - let mut conds = self.conditions.lock().unwrap(); - let mut cnt = 0u32; - if let Some(v) = conds.map.get_mut(&key) { - for waiter in v { - if cnt < count { - waiter.notified = true; // mark as was waiked up - waiter.thread.unpark(); // wakeup! 
- cnt += 1; - } - } - } - Ok(cnt) + self.do_notify(key, count) } + /// Perform an Atomic.Notify pub(crate) fn imported_memory_notify( &mut self, @@ -1010,19 +1015,7 @@ impl Instance { memory_index: memory_index.as_u32(), address: dst, }; - let mut conds = self.conditions.lock().unwrap(); - let mut cnt = 0u32; - if conds.map.contains_key(&key) { - let v = conds.map.get_mut(&key).unwrap(); - for waiter in v { - if cnt < count { - waiter.notified = true; // mark as was waiked up - waiter.thread.unpark(); // wakeup! - cnt += 1; - } - } - } - Ok(cnt) + self.do_notify(key, count) } }