diff --git a/mlkem/native/aarch64/src/intt_clean.S b/mlkem/native/aarch64/src/intt_clean.S
index 01079d7f4..28ad38975 100644
--- a/mlkem/native/aarch64/src/intt_clean.S
+++ b/mlkem/native/aarch64/src/intt_clean.S
@@ -149,7 +149,7 @@
 
         inp     .req x3
         count   .req x4
-        xtmp    .req x5
+        wtmp    .req w5   // 32-bit scratch GPR used below to materialize constants
 
         data0  .req v8
         data1  .req v9
@@ -193,40 +193,20 @@
         t3  .req v28
 
         ninv             .req v29
-        q_ninv           .req q29
         ninv_tw          .req v30
-        q_ninv_tw        .req q30
-
-/* Literal pool */
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_consts:         .short 3329
-                  .short 20159
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-c_ninv:           dup8h 512
-c_ninv_tw:        dup8h 5040
 
 MLKEM_ASM_NAMESPACE(intt_asm_clean):
         push_stack
 
-        ldr q_consts,  c_consts
-        ldr q_ninv,    c_ninv
-        ldr q_ninv_tw, c_ninv_tw
+        // Setup constants in registers instead of loading them from a literal pool
+        mov wtmp, #3329          // lane 0 value of the removed c_consts pool
+        mov consts.h[0], wtmp
+        mov wtmp, #20159         // lane 1 value of the removed c_consts pool
+        mov consts.h[1], wtmp    // lanes 2-7 of consts are not written here (the pool had zeros there)
+        mov wtmp, #512
+        dup ninv.8h, wtmp        // ninv = 512 in all 8 halfword lanes (was c_ninv)
+        mov wtmp, #5040
+        dup ninv_tw.8h, wtmp     // ninv_tw = 5040 in all 8 halfword lanes (was c_ninv_tw)
 
         mov inp, in
         mov count, #8
@@ -367,7 +347,7 @@ layer012_start:
     .unreq r56_ptr
     .unreq inp
     .unreq count
-    .unreq xtmp
+    .unreq wtmp
     .unreq data0
     .unreq data1
     .unreq data2
@@ -404,8 +384,6 @@ layer012_start:
     .unreq t2
     .unreq t3
     .unreq ninv
-    .unreq q_ninv
     .unreq ninv_tw
-    .unreq q_ninv_tw
 
 #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */
diff --git a/mlkem/native/aarch64/src/intt_opt.S b/mlkem/native/aarch64/src/intt_opt.S
index 5c420d9eb..857c729cb 100644
--- a/mlkem/native/aarch64/src/intt_opt.S
+++ b/mlkem/native/aarch64/src/intt_opt.S
@@ -149,7 +149,7 @@
 
         inp     .req x3
         count   .req x4
-        xtmp    .req x5
+        wtmp    .req w5   // 32-bit scratch GPR used below to materialize constants
 
         data0  .req v8
         data1  .req v9
@@ -193,40 +193,20 @@
         t3  .req v28
 
         ninv             .req v29
-        q_ninv           .req q29
         ninv_tw          .req v30
-        q_ninv_tw        .req q30
-
-/* Literal pool */
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_consts:         .short 3329
-                  .short 20159
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-                  .short 0
-c_ninv:           dup8h 512
-c_ninv_tw:        dup8h 5040
 
 MLKEM_ASM_NAMESPACE(intt_asm_opt):
         push_stack
 
-        ldr q_consts,  c_consts
-        ldr q_ninv,    c_ninv
-        ldr q_ninv_tw, c_ninv_tw
+        // Setup constants in registers instead of loading them from a literal pool
+        mov wtmp, #3329          // lane 0 value of the removed c_consts pool
+        mov consts.h[0], wtmp
+        mov wtmp, #20159         // lane 1 value of the removed c_consts pool
+        mov consts.h[1], wtmp    // lanes 2-7 of consts are not written here (the pool had zeros there)
+        mov wtmp, #512
+        dup ninv.8h, wtmp        // ninv = 512 in all 8 halfword lanes (was c_ninv)
+        mov wtmp, #5040
+        dup ninv_tw.8h, wtmp     // ninv_tw = 5040 in all 8 halfword lanes (was c_ninv_tw)
 
         mov inp, in
         mov count, #8
@@ -1023,7 +1003,7 @@ layer012_start:
     .unreq r56_ptr
     .unreq inp
     .unreq count
-    .unreq xtmp
+    .unreq wtmp
     .unreq data0
     .unreq data1
     .unreq data2
@@ -1060,8 +1040,6 @@ layer012_start:
     .unreq t2
     .unreq t3
     .unreq ninv
-    .unreq q_ninv
     .unreq ninv_tw
-    .unreq q_ninv_tw
 
 #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
diff --git a/mlkem/native/aarch64/src/ntt_clean.S b/mlkem/native/aarch64/src/ntt_clean.S
index 14d9bea8e..30fdc76b0 100644
--- a/mlkem/native/aarch64/src/ntt_clean.S
+++ b/mlkem/native/aarch64/src/ntt_clean.S
@@ -121,7 +121,7 @@
 
         inp     .req x3
         count   .req x4
-        xtmp    .req x5
+        wtmp    .req w5   // 32-bit scratch GPR used below to materialize constants
 
         data0  .req v8
         data1  .req v9
@@ -156,7 +156,6 @@
         q_root2_tw .req q6
 
         consts    .req v7
-        q_consts  .req q7
 
         tmp .req v24
         t0  .req v25
@@ -167,21 +166,13 @@
         .text
         .global MLKEM_ASM_NAMESPACE(ntt_asm_clean)
 
-/* Literal pool */
-.p2align 4
-c_consts:
-        .short 3329
-        .short 20159
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-
 MLKEM_ASM_NAMESPACE(ntt_asm_clean):
         push_stack
-        ldr q_consts, c_consts
+
+        mov wtmp, #3329          // lane 0 value of the removed c_consts pool
+        mov consts.h[0], wtmp
+        mov wtmp, #20159         // lane 1 value of the removed c_consts pool
+        mov consts.h[1], wtmp    // lanes 2-7 of consts are not written here (the pool had zeros there)
 
         mov inp, in
         mov count, #4
@@ -286,7 +277,7 @@ layer3456_start:
     .unreq r56_ptr
     .unreq inp
     .unreq count
-    .unreq xtmp
+    .unreq wtmp
     .unreq data0
     .unreq data1
     .unreq data2
@@ -316,7 +307,6 @@ layer3456_start:
     .unreq q_root1_tw
     .unreq q_root2_tw
     .unreq consts
-    .unreq q_consts
     .unreq tmp
     .unreq t0
     .unreq t1
diff --git a/mlkem/native/aarch64/src/ntt_opt.S b/mlkem/native/aarch64/src/ntt_opt.S
index d979d76a0..431f9dc6f 100644
--- a/mlkem/native/aarch64/src/ntt_opt.S
+++ b/mlkem/native/aarch64/src/ntt_opt.S
@@ -121,7 +121,7 @@
 
         inp     .req x3
         count   .req x4
-        xtmp    .req x5
+        wtmp    .req w5   // 32-bit scratch GPR used below to materialize constants
 
         data0  .req v8
         data1  .req v9
@@ -167,21 +167,13 @@
         .text
         .global MLKEM_ASM_NAMESPACE(ntt_asm_opt)
 
-/* Literal pool */
-.p2align 4
-c_consts:
-        .short 3329
-        .short 20159
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-        .short 0
-
 MLKEM_ASM_NAMESPACE(ntt_asm_opt):
         push_stack
-        ldr q_consts, c_consts
+
+        mov wtmp, #3329          // lane 0 value of the removed c_consts pool
+        mov consts.h[0], wtmp
+        mov wtmp, #20159         // lane 1 value of the removed c_consts pool
+        mov consts.h[1], wtmp    // lanes 2-7 of consts are not written here (the pool had zeros there)
 
         mov inp, in
         mov count, #4
@@ -922,7 +914,7 @@ MLKEM_ASM_NAMESPACE(ntt_asm_opt):
     .unreq r56_ptr
     .unreq inp
     .unreq count
-    .unreq xtmp
+    .unreq wtmp
     .unreq data0
     .unreq data1
     .unreq data2
diff --git a/mlkem/native/aarch64/src/poly_clean.S b/mlkem/native/aarch64/src/poly_clean.S
index 071ce2a7b..f3ee0796f 100644
--- a/mlkem/native/aarch64/src/poly_clean.S
+++ b/mlkem/native/aarch64/src/poly_clean.S
@@ -6,33 +6,6 @@
 #include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
 
-/* We use a single literal pool for all functions in this file.
- * This is OK even when the file gets expanded through SLOTHY,
- * since PC-relative offets are up to 1MB in AArch64.
- *
- * The use of dup8h to build constant vectors in memory
- * is slightly wasteful and could be avoided with a GPR-load
- * followed by Neon `dup`, but we're ultimately only talking
- * about 64 bytes, so it seems OK.
- */
-
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_modulus:         dup8h 3329   // ML-KEM modulus
-c_modulus_twisted: dup8h 20159  // Barrett twist of 1 wrt 2^27
-c_mont_constant:   dup8h -1044  // 2^16 % 3329
-c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
-
 /*
  * Some modular arithmetic macros
  */
@@ -70,6 +43,7 @@ c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
 
         ptr               .req x0
         count             .req x1
+        wtmp              .req w2 // 32-bit scratch GPR for building constants
 
         data              .req v0
         q_data            .req q0
@@ -77,14 +51,15 @@ c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
         tmp               .req v1
         mask              .req v2
         modulus           .req v3
-        q_modulus         .req q3
         modulus_twisted   .req v4
-        q_modulus_twisted .req q4
 
 MLKEM_ASM_NAMESPACE(poly_reduce_asm_clean):
 
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp // broadcast to all 8 halfword lanes
+
+        mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+        dup modulus_twisted.8h, wtmp // broadcast to all 8 halfword lanes
 
         mov count, #8
 loop_start:
@@ -115,6 +90,7 @@ loop_start:
 
         .unreq ptr
         .unreq count
+        .unreq wtmp
 
         .unreq data
         .unreq q_data
@@ -122,9 +98,7 @@ loop_start:
         .unreq tmp
         .unreq mask
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
 /********************************************
  *          poly_mulcache_compute()         *
@@ -137,6 +111,7 @@ loop_start:
         zeta_ptr          .req x2
         zeta_twisted_ptr  .req x3
         count             .req x4
+        wtmp              .req w5 // 32-bit scratch GPR for building constants
 
         data_odd          .req v0
         zeta              .req v1
@@ -152,13 +127,14 @@ loop_start:
         q_dst             .req q5
 
         modulus           .req v6
-        q_modulus         .req q6
         modulus_twisted   .req v7
-        q_modulus_twisted .req q7
 
 MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_clean):
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp // broadcast to all 8 halfword lanes
+
+        mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+        dup modulus_twisted.8h, wtmp // broadcast to all 8 halfword lanes
 
         mov count, #16
 mulcache_compute_loop_start:
@@ -185,6 +161,7 @@ mulcache_compute_loop_start:
         .unreq zeta_ptr
         .unreq zeta_twisted_ptr
         .unreq count
+        .unreq wtmp
 
         .unreq data_odd
         .unreq zeta
@@ -200,9 +177,7 @@ mulcache_compute_loop_start:
         .unreq q_dst
 
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
 /********************************************
  *             poly_tobytes()               *
@@ -261,6 +236,7 @@ poly_tobytes_asm_clean_asm_loop_start:
 
         src               .req x0
         count             .req x1
+        wtmp              .req w2 // 32-bit scratch GPR for building constants
 
         data              .req v0
         q_data            .req q0
@@ -268,22 +244,25 @@ poly_tobytes_asm_clean_asm_loop_start:
         q_res             .req q1
 
         factor            .req v2
-        q_factor          .req q2
         factor_t          .req v3
-        q_factor_t        .req q3
         modulus           .req v4
-        q_modulus         .req q4
         modulus_twisted   .req v5
-        q_modulus_twisted .req q5
 
         tmp0              .req v6
 
 MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean):
 
-        ldr q_modulus,         c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
-        ldr q_factor,          c_mont_constant
-        ldr q_factor_t,        c_barrett_twist
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp // broadcast to all 8 halfword lanes
+
+        mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+        dup modulus_twisted.8h, wtmp // broadcast to all 8 halfword lanes
+
+        mov wtmp, #-1044 // 2^16 % 3329, as signed representative (2285 - 3329)
+        dup factor.8h, wtmp // dup copies only the low 16 bits of wtmp into each lane
+
+        mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16)
+        dup factor_t.8h, wtmp // dup copies only the low 16 bits of wtmp into each lane
 
         mov count, #8
 poly_tomont_asm_loop:
@@ -311,6 +290,7 @@ poly_tomont_asm_loop:
 
         .unreq src
         .unreq count
+        .unreq wtmp
 
         .unreq data
         .unreq q_data
@@ -318,13 +298,9 @@ poly_tomont_asm_loop:
         .unreq q_res
 
         .unreq factor
-        .unreq q_factor
         .unreq factor_t
-        .unreq q_factor_t
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
         .unreq tmp0
 
diff --git a/mlkem/native/aarch64/src/poly_opt.S b/mlkem/native/aarch64/src/poly_opt.S
index 002d2edec..555c60a67 100644
--- a/mlkem/native/aarch64/src/poly_opt.S
+++ b/mlkem/native/aarch64/src/poly_opt.S
@@ -6,33 +6,6 @@
 #include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
-/* We use a single literal pool for all functions in this file.
- * This is OK even when the file gets expanded through SLOTHY,
- * since PC-relative offets are up to 1MB in AArch64.
- *
- * The use of dup8h to build constant vectors in memory
- * is slightly wasteful and could be avoided with a GPR-load
- * followed by Neon `dup`, but we're ultimately only talking
- * about 64 bytes, so it seems OK.
- */
-
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_modulus:         dup8h 3329   // ML-KEM modulus
-c_modulus_twisted: dup8h 20159  // Barrett twist of 1 wrt 2^27
-c_mont_constant:   dup8h -1044  // 2^16 % 3329
-c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
-
 /*
  * Some modular arithmetic macros
  */
@@ -70,6 +43,7 @@ c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
 
         ptr               .req x0
         count             .req x1
+        wtmp              .req w2 // 32-bit scratch GPR for building constants
 
         data              .req v0
         q_data            .req q0
@@ -77,14 +51,15 @@ c_barrett_twist:   dup8h -10276 // Barrett twist of -1044 (wrt 2^16)
         tmp               .req v1
         mask              .req v2
         modulus           .req v3
-        q_modulus         .req q3
         modulus_twisted   .req v4
-        q_modulus_twisted .req q4
 
 MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
 
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp // broadcast to all 8 halfword lanes
+
+        mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+        dup modulus_twisted.8h, wtmp // broadcast to all 8 halfword lanes
 
         mov count, #8
                                                // Instructions:    15
@@ -278,6 +253,7 @@ MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
 
         .unreq ptr
         .unreq count
+        .unreq wtmp
 
         .unreq data
         .unreq q_data
@@ -285,9 +261,7 @@ MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
         .unreq tmp
         .unreq mask
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
 /********************************************
  *          poly_mulcache_compute()         *
@@ -300,6 +274,7 @@ MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
         zeta_ptr          .req x2
         zeta_twisted_ptr  .req x3
         count             .req x4
+        wtmp              .req w5 // 32-bit scratch GPR for building constants
 
         data_odd          .req v0
         zeta              .req v1
@@ -315,13 +290,14 @@ MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt):
         q_dst             .req q5
 
         modulus           .req v6
-        q_modulus         .req q6
         modulus_twisted   .req v7
-        q_modulus_twisted .req q7
 
 MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt):
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp // broadcast to all 8 halfword lanes
+
+        mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+        dup modulus_twisted.8h, wtmp // broadcast to all 8 halfword lanes
 
         mov count, #16
                                               // Instructions:    7
@@ -426,6 +402,7 @@ MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt):
         .unreq zeta_ptr
         .unreq zeta_twisted_ptr
         .unreq count
+        .unreq wtmp
 
         .unreq data_odd
         .unreq zeta
@@ -441,9 +418,7 @@ MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt):
         .unreq q_dst
 
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
 /********************************************
  *             poly_tobytes()               *
@@ -502,6 +477,7 @@ poly_tobytes_asm_opt_asm_loop_start:
 
         src               .req x0
         count             .req x1
+        wtmp              .req w2 // 32-bit scratch GPR for building constants
 
         data              .req v0
         q_data            .req q0
@@ -509,22 +485,25 @@ poly_tobytes_asm_opt_asm_loop_start:
         q_res             .req q1
 
         factor            .req v2
-        q_factor          .req q2
         factor_t          .req v3
-        q_factor_t        .req q3
         modulus           .req v4
-        q_modulus         .req q4
         modulus_twisted   .req v5
-        q_modulus_twisted .req q5
 
         tmp0              .req v6
 
 MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt):
 
-        ldr q_modulus,         c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
-        ldr q_factor,          c_mont_constant
-        ldr q_factor_t,        c_barrett_twist
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp // broadcast to all 8 halfword lanes
+
+        mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
+        dup modulus_twisted.8h, wtmp // broadcast to all 8 halfword lanes
+
+        mov wtmp, #-1044 // 2^16 % 3329, as signed representative (2285 - 3329)
+        dup factor.8h, wtmp // dup copies only the low 16 bits of wtmp into each lane
+
+        mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16)
+        dup factor_t.8h, wtmp // dup copies only the low 16 bits of wtmp into each lane
 
         mov count, #8
                                              // Instructions:    5
@@ -670,6 +649,7 @@ MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt):
 
         .unreq src
         .unreq count
+        .unreq wtmp
 
         .unreq data
         .unreq q_data
@@ -677,13 +657,9 @@ MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt):
         .unreq q_res
 
         .unreq factor
-        .unreq q_factor
         .unreq factor_t
-        .unreq q_factor_t
         .unreq modulus
-        .unreq q_modulus
         .unreq modulus_twisted
-        .unreq q_modulus_twisted
 
         .unreq tmp0
 
diff --git a/mlkem/native/aarch64/src/polyvec_clean.S b/mlkem/native/aarch64/src/polyvec_clean.S
index 23f420f94..ef590db9c 100644
--- a/mlkem/native/aarch64/src/polyvec_clean.S
+++ b/mlkem/native/aarch64/src/polyvec_clean.S
@@ -12,31 +12,6 @@
 #include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
 
-/* We use a single literal pool for all functions in this file.
- * This is OK even when the file gets expanded through SLOTHY,
- * since PC-relative offets are up to 1MB in AArch64.
- *
- * The use of dup8h to build constant vectors in memory
- * is slightly wasteful and could be avoided with a GPR-load
- * followed by Neon `dup`, but we're ultimately only talking
- * about 64 bytes, so it seems OK.
- */
-
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_modulus:         dup8h 3329   // ML-KEM modulus
-c_modulus_twisted: dup8h 3327
-
 // Input:
 // - Vectors al, ah of 32-bit entries
 // Output:
@@ -136,11 +111,10 @@ c_modulus_twisted: dup8h 3327
         b3_ptr       .req x11
         b3_cache_ptr .req x12
         count        .req x13
+        wtmp         .req w14 // 32-bit scratch GPR for building constants
 
         modulus           .req v0
-        q_modulus         .req q0
         modulus_twisted   .req v2
-        q_modulus_twisted .req q2
 
         aa0      .req v3
         aa1      .req v4
@@ -168,8 +142,12 @@ c_modulus_twisted: dup8h 3327
 
 MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp // broadcast to all 8 halfword lanes
+
+        mov wtmp, #3327 // 3329 * 3327 = -1 mod 2^16
+        dup modulus_twisted.8h, wtmp // broadcast to all 8 halfword lanes
 
         // Computed bases of vector entries
 
@@ -202,8 +180,11 @@ k2_loop_start:
 
 MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp // broadcast to all 8 halfword lanes
+
+        mov wtmp, #3327 // 3329 * 3327 = -1 mod 2^16
+        dup modulus_twisted.8h, wtmp // broadcast to all 8 halfword lanes
 
         // Computed bases of vector entries
 
@@ -241,8 +222,11 @@ k3_loop_start:
 
 MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp // broadcast to all 8 halfword lanes
+
+        mov wtmp, #3327 // 3329 * 3327 = -1 mod 2^16
+        dup modulus_twisted.8h, wtmp // broadcast to all 8 halfword lanes
 
         // Computed bases of vector entries
 
@@ -301,9 +285,7 @@ k4_loop_start:
     .unreq b3_cache_ptr
     .unreq count
     .unreq modulus
-    .unreq q_modulus
     .unreq modulus_twisted
-    .unreq q_modulus_twisted
     .unreq aa0
     .unreq aa1
     .unreq bb0
@@ -312,6 +294,7 @@ k4_loop_start:
     .unreq res0l
     .unreq res1l
     .unreq res0h
+    .unreq wtmp
     .unreq res1h
     .unreq tmp0
     .unreq tmp1
diff --git a/mlkem/native/aarch64/src/polyvec_opt.S b/mlkem/native/aarch64/src/polyvec_opt.S
index c14feee50..efbc609eb 100644
--- a/mlkem/native/aarch64/src/polyvec_opt.S
+++ b/mlkem/native/aarch64/src/polyvec_opt.S
@@ -12,31 +12,6 @@
 #include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
-/* We use a single literal pool for all functions in this file.
- * This is OK even when the file gets expanded through SLOTHY,
- * since PC-relative offets are up to 1MB in AArch64.
- *
- * The use of dup8h to build constant vectors in memory
- * is slightly wasteful and could be avoided with a GPR-load
- * followed by Neon `dup`, but we're ultimately only talking
- * about 64 bytes, so it seems OK.
- */
-
-.macro dup8h c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-        .short \c
-.endm
-
-.p2align 4
-c_modulus:         dup8h 3329   // ML-KEM modulus
-c_modulus_twisted: dup8h 3327
-
 // Input:
 // - Vectors al, ah of 32-bit entries
 // Output:
@@ -136,11 +111,10 @@ c_modulus_twisted: dup8h 3327
         b3_ptr       .req x11
         b3_cache_ptr .req x12
         count        .req x13
+        wtmp         .req w14 // 32-bit scratch GPR for building constants
 
         modulus           .req v0
-        q_modulus         .req q0
         modulus_twisted   .req v2
-        q_modulus_twisted .req q2
 
         aa0      .req v3
         aa1      .req v4
@@ -168,8 +142,12 @@ c_modulus_twisted: dup8h 3327
 
 MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp // broadcast to all 8 halfword lanes
+
+        mov wtmp, #3327 // 3329 * 3327 = -1 mod 2^16
+        dup modulus_twisted.8h, wtmp // broadcast to all 8 halfword lanes
 
         // Computed bases of vector entries
 
@@ -534,8 +512,11 @@ MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
 
 MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp // broadcast to all 8 halfword lanes
+
+        mov wtmp, #3327 // 3329 * 3327 = -1 mod 2^16
+        dup modulus_twisted.8h, wtmp // broadcast to all 8 halfword lanes
 
         // Computed bases of vector entries
 
@@ -1005,8 +986,11 @@ MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
 
 MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
         push_stack
-        ldr q_modulus, c_modulus
-        ldr q_modulus_twisted, c_modulus_twisted
+        mov wtmp, #3329 // ML-KEM modulus
+        dup modulus.8h, wtmp // broadcast to all 8 halfword lanes
+
+        mov wtmp, #3327 // 3329 * 3327 = -1 mod 2^16
+        dup modulus_twisted.8h, wtmp // broadcast to all 8 halfword lanes
 
         // Computed bases of vector entries
 
@@ -1597,9 +1581,8 @@ MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt):
     .unreq b3_cache_ptr
     .unreq count
     .unreq modulus
-    .unreq q_modulus
     .unreq modulus_twisted
-    .unreq q_modulus_twisted
+    .unreq wtmp
     .unreq aa0
     .unreq aa1
     .unreq bb0
diff --git a/mlkem/native/aarch64/src/rej_uniform_asm_clean.S b/mlkem/native/aarch64/src/rej_uniform_asm_clean.S
index 0c9f593dc..9158d6c82 100644
--- a/mlkem/native/aarch64/src/rej_uniform_asm_clean.S
+++ b/mlkem/native/aarch64/src/rej_uniform_asm_clean.S
@@ -45,6 +45,7 @@
     len                         .req w4
 
     /* Temporary output on the stack */
+    xtmp                        .req x7 // NOTE(review): same register as output_tmp; usable only before output_tmp is live
     output_tmp                  .req x7
     output_tmp_base             .req x8
 
@@ -110,20 +111,26 @@
 
     mlkem_q                     .req v30
     bits                        .req v31
-    bits_q                      .req q31
 
 .text
-/* Literal pool */
-.p2align 4
-c_bit_table:
-    .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
-
 .align 4
 .global MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean)
 MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean):
     push_stack
 
-    ldr  bits_q, c_bit_table
+    // Build bits = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80} as halfword lanes 0..7
+    movz xtmp, 0x1
+    movk xtmp, 0x2, lsl 16
+    movk xtmp, 0x4, lsl 32
+    movk xtmp, 0x8, lsl 48
+    mov bits.d[0], xtmp // halfword lanes 0-3
+
+    movz xtmp, 0x10
+    movk xtmp, 0x20, lsl 16
+    movk xtmp, 0x40, lsl 32
+    movk xtmp, 0x80, lsl 48
+    mov bits.d[1], xtmp // halfword lanes 4-7
+
     movz tmp, #MLKEM_Q
     dup  mlkem_q.8h, tmp
 
@@ -349,6 +356,7 @@ return:
     .unreq count
     .unreq buf_consumed
     .unreq tmp
+    .unreq xtmp
     .unreq final_copy_count
     .unreq rec_idx_0
     .unreq rec_idx_1
@@ -393,7 +401,6 @@ return:
     .unreq table3q
     .unreq mlkem_q
     .unreq bits
-    .unreq bits_q
 
 #endif /* defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) ||
           defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) */