diff --git a/mlkem/native/aarch64/src/intt_clean.S b/mlkem/native/aarch64/src/intt_clean.S index 01079d7f4..28ad38975 100644 --- a/mlkem/native/aarch64/src/intt_clean.S +++ b/mlkem/native/aarch64/src/intt_clean.S @@ -149,7 +149,7 @@ inp .req x3 count .req x4 - xtmp .req x5 + wtmp .req w5 data0 .req v8 data1 .req v9 @@ -193,40 +193,20 @@ t3 .req v28 ninv .req v29 - q_ninv .req q29 ninv_tw .req v30 - q_ninv_tw .req q30 - -/* Literal pool */ -.macro dup8h c - .short \c - .short \c - .short \c - .short \c - .short \c - .short \c - .short \c - .short \c -.endm - -.p2align 4 -c_consts: .short 3329 - .short 20159 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 -c_ninv: dup8h 512 -c_ninv_tw: dup8h 5040 MLKEM_ASM_NAMESPACE(intt_asm_clean): push_stack - ldr q_consts, c_consts - ldr q_ninv, c_ninv - ldr q_ninv_tw, c_ninv_tw + // Setup constants + mov wtmp, #3329 + mov consts.h[0], wtmp + mov wtmp, #20159 + mov consts.h[1], wtmp + mov wtmp, #512 + dup ninv.8h, wtmp + mov wtmp, #5040 + dup ninv_tw.8h, wtmp mov inp, in mov count, #8 @@ -367,7 +347,7 @@ layer012_start: .unreq r56_ptr .unreq inp .unreq count - .unreq xtmp + .unreq wtmp .unreq data0 .unreq data1 .unreq data2 @@ -404,8 +384,6 @@ layer012_start: .unreq t2 .unreq t3 .unreq ninv - .unreq q_ninv .unreq ninv_tw - .unreq q_ninv_tw #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */ diff --git a/mlkem/native/aarch64/src/intt_opt.S b/mlkem/native/aarch64/src/intt_opt.S index 5c420d9eb..857c729cb 100644 --- a/mlkem/native/aarch64/src/intt_opt.S +++ b/mlkem/native/aarch64/src/intt_opt.S @@ -149,7 +149,7 @@ inp .req x3 count .req x4 - xtmp .req x5 + wtmp .req w5 data0 .req v8 data1 .req v9 @@ -193,40 +193,20 @@ t3 .req v28 ninv .req v29 - q_ninv .req q29 ninv_tw .req v30 - q_ninv_tw .req q30 - -/* Literal pool */ -.macro dup8h c - .short \c - .short \c - .short \c - .short \c - .short \c - .short \c - .short \c - .short \c -.endm - -.p2align 4 -c_consts: .short 3329 - .short 20159 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 -c_ninv: dup8h 512 -c_ninv_tw: dup8h 5040 MLKEM_ASM_NAMESPACE(intt_asm_opt): push_stack - ldr q_consts, c_consts - ldr q_ninv, c_ninv - ldr q_ninv_tw, c_ninv_tw + // Setup constants + mov wtmp, #3329 + mov consts.h[0], wtmp + mov wtmp, #20159 + mov consts.h[1], wtmp + mov wtmp, #512 + dup ninv.8h, wtmp + mov wtmp, #5040 + dup ninv_tw.8h, wtmp mov inp, in mov count, #8 @@ -1023,7 +1003,7 @@ layer012_start: .unreq r56_ptr .unreq inp .unreq count - .unreq xtmp + .unreq wtmp .unreq data0 .unreq data1 .unreq data2 @@ -1060,8 +1040,6 @@ layer012_start: .unreq t2 .unreq t3 .unreq ninv - .unreq q_ninv .unreq ninv_tw - .unreq q_ninv_tw #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git a/mlkem/native/aarch64/src/ntt_clean.S b/mlkem/native/aarch64/src/ntt_clean.S index 14d9bea8e..30fdc76b0 100644 --- a/mlkem/native/aarch64/src/ntt_clean.S +++ b/mlkem/native/aarch64/src/ntt_clean.S @@ -121,7 +121,7 @@ inp .req x3 count .req x4 - xtmp .req x5 + wtmp .req w5 data0 .req v8 data1 .req v9 @@ -156,7 +156,6 @@ q_root2_tw .req q6 consts .req v7 - q_consts .req q7 tmp .req v24 t0 .req v25 @@ -167,21 +166,13 @@ .text .global MLKEM_ASM_NAMESPACE(ntt_asm_clean) -/* Literal pool */ -.p2align 4 -c_consts: - .short 3329 - .short 20159 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - MLKEM_ASM_NAMESPACE(ntt_asm_clean): push_stack - ldr q_consts, c_consts + + mov wtmp, #3329 + mov consts.h[0], wtmp + mov wtmp, #20159 + mov consts.h[1], wtmp mov inp, in mov count, #4 @@ -286,7 +277,7 @@ layer3456_start: .unreq r56_ptr .unreq inp .unreq count - .unreq xtmp + .unreq wtmp .unreq data0 .unreq data1 .unreq data2 @@ -316,7 +307,6 @@ layer3456_start: .unreq q_root1_tw .unreq q_root2_tw .unreq consts - .unreq q_consts .unreq tmp .unreq t0 .unreq t1 diff --git a/mlkem/native/aarch64/src/ntt_opt.S b/mlkem/native/aarch64/src/ntt_opt.S index d979d76a0..431f9dc6f 100644 --- a/mlkem/native/aarch64/src/ntt_opt.S +++ b/mlkem/native/aarch64/src/ntt_opt.S @@ -121,7 +121,7 @@ inp .req x3 count .req x4 - xtmp .req x5 + wtmp .req w5 data0 .req v8 data1 .req v9 @@ -167,21 +167,13 @@ .text .global MLKEM_ASM_NAMESPACE(ntt_asm_opt) -/* Literal pool */ -.p2align 4 -c_consts: - .short 3329 - .short 20159 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - MLKEM_ASM_NAMESPACE(ntt_asm_opt): push_stack - ldr q_consts, c_consts + + mov wtmp, #3329 + mov consts.h[0], wtmp + mov wtmp, #20159 + mov consts.h[1], wtmp mov inp, in mov count, #4 @@ -922,7 +914,7 @@ MLKEM_ASM_NAMESPACE(ntt_asm_opt): .unreq r56_ptr .unreq inp .unreq count - .unreq xtmp + .unreq wtmp .unreq data0 .unreq data1 .unreq data2 diff --git a/mlkem/native/aarch64/src/poly_clean.S b/mlkem/native/aarch64/src/poly_clean.S index 071ce2a7b..f3ee0796f 100644 --- a/mlkem/native/aarch64/src/poly_clean.S +++ b/mlkem/native/aarch64/src/poly_clean.S @@ -6,33 +6,6 @@ #include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) -/* We use a single literal pool for all functions in this file. - * This is OK even when the file gets expanded through SLOTHY, - * since PC-relative offets are up to 1MB in AArch64. - * - * The use of dup8h to build constant vectors in memory - * is slightly wasteful and could be avoided with a GPR-load - * followed by Neon `dup`, but we're ultimately only talking - * about 64 bytes, so it seems OK. - */ - -.macro dup8h c - .short \c - .short \c - .short \c - .short \c - .short \c - .short \c - .short \c - .short \c -.endm - -.p2align 4 -c_modulus: dup8h 3329 // ML-KEM modulus -c_modulus_twisted: dup8h 20159 // Barrett twist of 1 wrt 2^27 -c_mont_constant: dup8h -1044 // 2^16 % 3329 -c_barrett_twist: dup8h -10276 // Barrett twist of -1044 (wrt 2^16) - /* * Some modular arithmetic macros */ @@ -70,6 +43,7 @@ c_barrett_twist: dup8h -10276 // Barrett twist of -1044 (wrt 2^16) ptr .req x0 count .req x1 + wtmp .req w2 data .req v0 q_data .req q0 @@ -77,14 +51,15 @@ c_barrett_twist: dup8h -10276 // Barrett twist of -1044 (wrt 2^16) tmp .req v1 mask .req v2 modulus .req v3 - q_modulus .req q3 modulus_twisted .req v4 - q_modulus_twisted .req q4 MLKEM_ASM_NAMESPACE(poly_reduce_asm_clean): - ldr q_modulus, c_modulus - ldr q_modulus_twisted, c_modulus_twisted + mov wtmp, #3329 // ML-KEM modulus + dup modulus.8h, wtmp + + mov wtmp, #20159 // Barrett twist of 1 wrt 2^27 + dup modulus_twisted.8h, wtmp mov count, #8 loop_start: @@ -115,6 +90,7 @@ loop_start: .unreq ptr .unreq count + .unreq wtmp .unreq data .unreq q_data @@ -122,9 +98,7 @@ loop_start: .unreq tmp .unreq mask .unreq modulus - .unreq q_modulus .unreq modulus_twisted - .unreq q_modulus_twisted /******************************************** * poly_mulcache_compute() * @@ -137,6 +111,7 @@ loop_start: zeta_ptr .req x2 zeta_twisted_ptr .req x3 count .req x4 + wtmp .req w5 data_odd .req v0 zeta .req v1 @@ -152,13 +127,14 @@ loop_start: q_dst .req q5 modulus .req v6 - q_modulus .req q6 modulus_twisted .req v7 - q_modulus_twisted .req q7 MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_clean): - ldr q_modulus, c_modulus - ldr q_modulus_twisted, c_modulus_twisted + mov wtmp, #3329 + dup modulus.8h, wtmp + + mov wtmp, #20159 + dup modulus_twisted.8h, wtmp mov count, #16 mulcache_compute_loop_start: @@ -185,6 +161,7 @@ mulcache_compute_loop_start: .unreq zeta_ptr .unreq zeta_twisted_ptr .unreq count + .unreq wtmp .unreq data_odd .unreq zeta @@ -200,9 +177,7 @@ mulcache_compute_loop_start: .unreq q_dst .unreq modulus - .unreq q_modulus .unreq modulus_twisted - .unreq q_modulus_twisted /******************************************** * poly_tobytes() * @@ -261,6 +236,7 @@ poly_tobytes_asm_clean_asm_loop_start: src .req x0 count .req x1 + wtmp .req w2 data .req v0 q_data .req q0 @@ -268,22 +244,25 @@ poly_tobytes_asm_clean_asm_loop_start: q_res .req q1 factor .req v2 - q_factor .req q2 factor_t .req v3 - q_factor_t .req q3 modulus .req v4 - q_modulus .req q4 modulus_twisted .req v5 - q_modulus_twisted .req q5 tmp0 .req v6 MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean): - ldr q_modulus, c_modulus - ldr q_modulus_twisted, c_modulus_twisted - ldr q_factor, c_mont_constant - ldr q_factor_t, c_barrett_twist + mov wtmp, #3329 // ML-KEM modulus + dup modulus.8h, wtmp + + mov wtmp, #20159 // Barrett twist of 1 wrt 2^27 + dup modulus_twisted.8h, wtmp + + mov wtmp, #-1044 // 2^16 % 3329 + dup factor.8h, wtmp + + mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16) + dup factor_t.8h, wtmp mov count, #8 poly_tomont_asm_loop: @@ -311,6 +290,7 @@ poly_tomont_asm_loop: .unreq src .unreq count + .unreq wtmp .unreq data .unreq q_data @@ -318,13 +298,9 @@ poly_tomont_asm_loop: .unreq q_res .unreq factor - .unreq q_factor .unreq factor_t - .unreq q_factor_t .unreq modulus - .unreq q_modulus .unreq modulus_twisted - .unreq q_modulus_twisted .unreq tmp0 diff --git a/mlkem/native/aarch64/src/poly_opt.S b/mlkem/native/aarch64/src/poly_opt.S index 002d2edec..555c60a67 100644 --- a/mlkem/native/aarch64/src/poly_opt.S +++ b/mlkem/native/aarch64/src/poly_opt.S @@ -6,33 +6,6 @@ #include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) -/* We use a single literal pool for all functions in this file. - * This is OK even when the file gets expanded through SLOTHY, - * since PC-relative offets are up to 1MB in AArch64. - * - * The use of dup8h to build constant vectors in memory - * is slightly wasteful and could be avoided with a GPR-load - * followed by Neon `dup`, but we're ultimately only talking - * about 64 bytes, so it seems OK. - */ - -.macro dup8h c - .short \c - .short \c - .short \c - .short \c - .short \c - .short \c - .short \c - .short \c -.endm - -.p2align 4 -c_modulus: dup8h 3329 // ML-KEM modulus -c_modulus_twisted: dup8h 20159 // Barrett twist of 1 wrt 2^27 -c_mont_constant: dup8h -1044 // 2^16 % 3329 -c_barrett_twist: dup8h -10276 // Barrett twist of -1044 (wrt 2^16) - /* * Some modular arithmetic macros */ @@ -70,6 +43,7 @@ c_barrett_twist: dup8h -10276 // Barrett twist of -1044 (wrt 2^16) ptr .req x0 count .req x1 + wtmp .req w2 data .req v0 q_data .req q0 @@ -77,14 +51,15 @@ c_barrett_twist: dup8h -10276 // Barrett twist of -1044 (wrt 2^16) tmp .req v1 mask .req v2 modulus .req v3 - q_modulus .req q3 modulus_twisted .req v4 - q_modulus_twisted .req q4 MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt): - ldr q_modulus, c_modulus - ldr q_modulus_twisted, c_modulus_twisted + mov wtmp, #3329 // ML-KEM modulus + dup modulus.8h, wtmp + + mov wtmp, #20159 // Barrett twist of 1 wrt 2^27 + dup modulus_twisted.8h, wtmp mov count, #8 // Instructions: 15 @@ -278,6 +253,7 @@ MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt): .unreq ptr .unreq count + .unreq wtmp .unreq data .unreq q_data @@ -285,9 +261,7 @@ MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt): .unreq tmp .unreq mask .unreq modulus - .unreq q_modulus .unreq modulus_twisted - .unreq q_modulus_twisted /******************************************** * poly_mulcache_compute() * @@ -300,6 +274,7 @@ MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt): zeta_ptr .req x2 zeta_twisted_ptr .req x3 count .req x4 + wtmp .req w5 data_odd .req v0 zeta .req v1 @@ -315,13 +290,14 @@ MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt): q_dst .req q5 modulus .req v6 - q_modulus .req q6 modulus_twisted .req v7 - q_modulus_twisted .req q7 MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt): - ldr q_modulus, c_modulus - ldr q_modulus_twisted, c_modulus_twisted + mov wtmp, #3329 + dup modulus.8h, wtmp + + mov wtmp, #20159 + dup modulus_twisted.8h, wtmp mov count, #16 // Instructions: 7 @@ -426,6 +402,7 @@ MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt): .unreq zeta_ptr .unreq zeta_twisted_ptr .unreq count + .unreq wtmp .unreq data_odd .unreq zeta @@ -441,9 +418,7 @@ MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt): .unreq q_dst .unreq modulus - .unreq q_modulus .unreq modulus_twisted - .unreq q_modulus_twisted /******************************************** * poly_tobytes() * @@ -502,6 +477,7 @@ poly_tobytes_asm_opt_asm_loop_start: src .req x0 count .req x1 + wtmp .req w2 data .req v0 q_data .req q0 @@ -509,22 +485,25 @@ poly_tobytes_asm_opt_asm_loop_start: q_res .req q1 factor .req v2 - q_factor .req q2 factor_t .req v3 - q_factor_t .req q3 modulus .req v4 - q_modulus .req q4 modulus_twisted .req v5 - q_modulus_twisted .req q5 tmp0 .req v6 MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt): - ldr q_modulus, c_modulus - ldr q_modulus_twisted, c_modulus_twisted - ldr q_factor, c_mont_constant - ldr q_factor_t, c_barrett_twist + mov wtmp, #3329 // ML-KEM modulus + dup modulus.8h, wtmp + + mov wtmp, #20159 // Barrett twist of 1 wrt 2^27 + dup modulus_twisted.8h, wtmp + + mov wtmp, #-1044 // 2^16 % 3329 + dup factor.8h, wtmp + + mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16) + dup factor_t.8h, wtmp mov count, #8 // Instructions: 5 @@ -670,6 +649,7 @@ MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt): .unreq src .unreq count + .unreq wtmp .unreq data .unreq q_data @@ -677,13 +657,9 @@ MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt): .unreq q_res .unreq factor - .unreq q_factor .unreq factor_t - .unreq q_factor_t .unreq modulus - .unreq q_modulus .unreq modulus_twisted - .unreq q_modulus_twisted .unreq tmp0 diff --git a/mlkem/native/aarch64/src/polyvec_clean.S b/mlkem/native/aarch64/src/polyvec_clean.S index 23f420f94..ef590db9c 100644 --- a/mlkem/native/aarch64/src/polyvec_clean.S +++ b/mlkem/native/aarch64/src/polyvec_clean.S @@ -12,31 +12,6 @@ #include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) -/* We use a single literal pool for all functions in this file. - * This is OK even when the file gets expanded through SLOTHY, - * since PC-relative offets are up to 1MB in AArch64. - * - * The use of dup8h to build constant vectors in memory - * is slightly wasteful and could be avoided with a GPR-load - * followed by Neon `dup`, but we're ultimately only talking - * about 64 bytes, so it seems OK. - */ - -.macro dup8h c - .short \c - .short \c - .short \c - .short \c - .short \c - .short \c - .short \c - .short \c -.endm - -.p2align 4 -c_modulus: dup8h 3329 // ML-KEM modulus -c_modulus_twisted: dup8h 3327 - // Input: // - Vectors al, ah of 32-bit entries // Output: @@ -136,11 +111,10 @@ c_modulus_twisted: dup8h 3327 b3_ptr .req x11 b3_cache_ptr .req x12 count .req x13 + wtmp .req w14 modulus .req v0 - q_modulus .req q0 modulus_twisted .req v2 - q_modulus_twisted .req q2 aa0 .req v3 aa1 .req v4 @@ -168,8 +142,12 @@ c_modulus_twisted: dup8h 3327 MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): push_stack - ldr q_modulus, c_modulus - ldr q_modulus_twisted, c_modulus_twisted + + mov wtmp, #3329 + dup modulus.8h, wtmp + + mov wtmp, #3327 + dup modulus_twisted.8h, wtmp // Computed bases of vector entries @@ -202,8 +180,11 @@ k2_loop_start: MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): push_stack - ldr q_modulus, c_modulus - ldr q_modulus_twisted, c_modulus_twisted + mov wtmp, #3329 + dup modulus.8h, wtmp + + mov wtmp, #3327 + dup modulus_twisted.8h, wtmp // Computed bases of vector entries @@ -241,8 +222,11 @@ k3_loop_start: MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): push_stack - ldr q_modulus, c_modulus - ldr q_modulus_twisted, c_modulus_twisted + mov wtmp, #3329 + dup modulus.8h, wtmp + + mov wtmp, #3327 + dup modulus_twisted.8h, wtmp // Computed bases of vector entries @@ -301,9 +285,7 @@ k4_loop_start: .unreq b3_cache_ptr .unreq count .unreq modulus - .unreq q_modulus .unreq modulus_twisted - .unreq q_modulus_twisted .unreq aa0 .unreq aa1 .unreq bb0 @@ -312,6 +294,7 @@ k4_loop_start: .unreq res0l .unreq res1l .unreq res0h + .unreq wtmp .unreq res1h .unreq tmp0 .unreq tmp1 diff --git a/mlkem/native/aarch64/src/polyvec_opt.S b/mlkem/native/aarch64/src/polyvec_opt.S index c14feee50..efbc609eb 100644 --- a/mlkem/native/aarch64/src/polyvec_opt.S +++ b/mlkem/native/aarch64/src/polyvec_opt.S @@ -12,31 +12,6 @@ #include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) -/* We use a single literal pool for all functions in this file. - * This is OK even when the file gets expanded through SLOTHY, - * since PC-relative offets are up to 1MB in AArch64. - * - * The use of dup8h to build constant vectors in memory - * is slightly wasteful and could be avoided with a GPR-load - * followed by Neon `dup`, but we're ultimately only talking - * about 64 bytes, so it seems OK. - */ - -.macro dup8h c - .short \c - .short \c - .short \c - .short \c - .short \c - .short \c - .short \c - .short \c -.endm - -.p2align 4 -c_modulus: dup8h 3329 // ML-KEM modulus -c_modulus_twisted: dup8h 3327 - // Input: // - Vectors al, ah of 32-bit entries // Output: @@ -136,11 +111,10 @@ c_modulus_twisted: dup8h 3327 b3_ptr .req x11 b3_cache_ptr .req x12 count .req x13 + wtmp .req w14 modulus .req v0 - q_modulus .req q0 modulus_twisted .req v2 - q_modulus_twisted .req q2 aa0 .req v3 aa1 .req v4 @@ -168,8 +142,12 @@ c_modulus_twisted: dup8h 3327 MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): push_stack - ldr q_modulus, c_modulus - ldr q_modulus_twisted, c_modulus_twisted + + mov wtmp, #3329 + dup modulus.8h, wtmp + + mov wtmp, #3327 + dup modulus_twisted.8h, wtmp // Computed bases of vector entries @@ -534,8 +512,11 @@ MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): push_stack - ldr q_modulus, c_modulus - ldr q_modulus_twisted, c_modulus_twisted + mov wtmp, #3329 + dup modulus.8h, wtmp + + mov wtmp, #3327 + dup modulus_twisted.8h, wtmp // Computed bases of vector entries @@ -1005,8 +986,11 @@ MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): push_stack - ldr q_modulus, c_modulus - ldr q_modulus_twisted, c_modulus_twisted + mov wtmp, #3329 + dup modulus.8h, wtmp + + mov wtmp, #3327 + dup modulus_twisted.8h, wtmp // Computed bases of vector entries @@ -1597,9 +1581,8 @@ MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): .unreq b3_cache_ptr .unreq count .unreq modulus - .unreq q_modulus .unreq modulus_twisted - .unreq q_modulus_twisted + .unreq wtmp .unreq aa0 .unreq aa1 .unreq bb0 diff --git a/mlkem/native/aarch64/src/rej_uniform_asm_clean.S b/mlkem/native/aarch64/src/rej_uniform_asm_clean.S index 0c9f593dc..9158d6c82 100644 --- a/mlkem/native/aarch64/src/rej_uniform_asm_clean.S +++ b/mlkem/native/aarch64/src/rej_uniform_asm_clean.S @@ -45,6 +45,7 @@ len .req w4 /* Temporary output on the stack */ + xtmp .req x7 output_tmp .req x7 output_tmp_base .req x8 @@ -110,20 +111,26 @@ mlkem_q .req v30 bits .req v31 - bits_q .req q31 .text -/* Literal pool */ -.p2align 4 -c_bit_table: - .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 - .align 4 .global MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean) MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean): push_stack - ldr bits_q, c_bit_table + // Load 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + movz xtmp, 0x1 + movk xtmp, 0x2, lsl 16 + movk xtmp, 0x4, lsl 32 + movk xtmp, 0x8, lsl 48 + mov bits.d[0], xtmp + + movz xtmp, 0x10 + movk xtmp, 0x20, lsl 16 + movk xtmp, 0x40, lsl 32 + movk xtmp, 0x80, lsl 48 + mov bits.d[1], xtmp + movz tmp, #MLKEM_Q dup mlkem_q.8h, tmp @@ -349,6 +356,7 @@ return: .unreq count .unreq buf_consumed .unreq tmp + .unreq xtmp .unreq final_copy_count .unreq rec_idx_0 .unreq rec_idx_1 @@ -393,7 +401,6 @@ return: .unreq table3q .unreq mlkem_q .unreq bits - .unreq bits_q #endif /* defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) */