Skip to content

Commit

Permalink
Merge pull request #667 from pq-code-package/asm_no_consts
Browse files: browse the repository at this point in the history
AArch64: Remove literal pools from native code
  • Loading branch information
hanno-becker authored Jan 17, 2025
2 parents e4ff720 + 7d15fdb commit c79b97a
Show file tree
Hide file tree
Showing 9 changed files with 141 additions and 278 deletions.
44 changes: 11 additions & 33 deletions mlkem/native/aarch64/src/intt_clean.S
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@

inp .req x3
count .req x4
xtmp .req x5
wtmp .req w5

data0 .req v8
data1 .req v9
Expand Down Expand Up @@ -193,40 +193,20 @@
t3 .req v28

ninv .req v29
q_ninv .req q29
ninv_tw .req v30
q_ninv_tw .req q30

/* Literal pool */
.macro dup8h c
.short \c
.short \c
.short \c
.short \c
.short \c
.short \c
.short \c
.short \c
.endm

.p2align 4
c_consts: .short 3329
.short 20159
.short 0
.short 0
.short 0
.short 0
.short 0
.short 0
c_ninv: dup8h 512
c_ninv_tw: dup8h 5040

MLKEM_ASM_NAMESPACE(intt_asm_clean):
push_stack

ldr q_consts, c_consts
ldr q_ninv, c_ninv
ldr q_ninv_tw, c_ninv_tw
// Setup constants
mov wtmp, #3329
mov consts.h[0], wtmp
mov wtmp, #20159
mov consts.h[1], wtmp
mov wtmp, #512
dup ninv.8h, wtmp
mov wtmp, #5040
dup ninv_tw.8h, wtmp

mov inp, in
mov count, #8
Expand Down Expand Up @@ -367,7 +347,7 @@ layer012_start:
.unreq r56_ptr
.unreq inp
.unreq count
.unreq xtmp
.unreq wtmp
.unreq data0
.unreq data1
.unreq data2
Expand Down Expand Up @@ -404,8 +384,6 @@ layer012_start:
.unreq t2
.unreq t3
.unreq ninv
.unreq q_ninv
.unreq ninv_tw
.unreq q_ninv_tw

#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */
44 changes: 11 additions & 33 deletions mlkem/native/aarch64/src/intt_opt.S
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@

inp .req x3
count .req x4
xtmp .req x5
wtmp .req w5

data0 .req v8
data1 .req v9
Expand Down Expand Up @@ -193,40 +193,20 @@
t3 .req v28

ninv .req v29
q_ninv .req q29
ninv_tw .req v30
q_ninv_tw .req q30

/* Literal pool */
.macro dup8h c
.short \c
.short \c
.short \c
.short \c
.short \c
.short \c
.short \c
.short \c
.endm

.p2align 4
c_consts: .short 3329
.short 20159
.short 0
.short 0
.short 0
.short 0
.short 0
.short 0
c_ninv: dup8h 512
c_ninv_tw: dup8h 5040

MLKEM_ASM_NAMESPACE(intt_asm_opt):
push_stack

ldr q_consts, c_consts
ldr q_ninv, c_ninv
ldr q_ninv_tw, c_ninv_tw
// Setup constants
mov wtmp, #3329
mov consts.h[0], wtmp
mov wtmp, #20159
mov consts.h[1], wtmp
mov wtmp, #512
dup ninv.8h, wtmp
mov wtmp, #5040
dup ninv_tw.8h, wtmp

mov inp, in
mov count, #8
Expand Down Expand Up @@ -1023,7 +1003,7 @@ layer012_start:
.unreq r56_ptr
.unreq inp
.unreq count
.unreq xtmp
.unreq wtmp
.unreq data0
.unreq data1
.unreq data2
Expand Down Expand Up @@ -1060,8 +1040,6 @@ layer012_start:
.unreq t2
.unreq t3
.unreq ninv
.unreq q_ninv
.unreq ninv_tw
.unreq q_ninv_tw

#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */
24 changes: 7 additions & 17 deletions mlkem/native/aarch64/src/ntt_clean.S
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@

inp .req x3
count .req x4
xtmp .req x5
wtmp .req w5

data0 .req v8
data1 .req v9
Expand Down Expand Up @@ -156,7 +156,6 @@
q_root2_tw .req q6

consts .req v7
q_consts .req q7

tmp .req v24
t0 .req v25
Expand All @@ -167,21 +166,13 @@
.text
.global MLKEM_ASM_NAMESPACE(ntt_asm_clean)

/* Literal pool */
.p2align 4
c_consts:
.short 3329
.short 20159
.short 0
.short 0
.short 0
.short 0
.short 0
.short 0

MLKEM_ASM_NAMESPACE(ntt_asm_clean):
push_stack
ldr q_consts, c_consts

mov wtmp, #3329
mov consts.h[0], wtmp
mov wtmp, #20159
mov consts.h[1], wtmp

mov inp, in
mov count, #4
Expand Down Expand Up @@ -286,7 +277,7 @@ layer3456_start:
.unreq r56_ptr
.unreq inp
.unreq count
.unreq xtmp
.unreq wtmp
.unreq data0
.unreq data1
.unreq data2
Expand Down Expand Up @@ -316,7 +307,6 @@ layer3456_start:
.unreq q_root1_tw
.unreq q_root2_tw
.unreq consts
.unreq q_consts
.unreq tmp
.unreq t0
.unreq t1
Expand Down
22 changes: 7 additions & 15 deletions mlkem/native/aarch64/src/ntt_opt.S
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@

inp .req x3
count .req x4
xtmp .req x5
wtmp .req w5

data0 .req v8
data1 .req v9
Expand Down Expand Up @@ -167,21 +167,13 @@
.text
.global MLKEM_ASM_NAMESPACE(ntt_asm_opt)

/* Literal pool */
.p2align 4
c_consts:
.short 3329
.short 20159
.short 0
.short 0
.short 0
.short 0
.short 0
.short 0

MLKEM_ASM_NAMESPACE(ntt_asm_opt):
push_stack
ldr q_consts, c_consts

mov wtmp, #3329
mov consts.h[0], wtmp
mov wtmp, #20159
mov consts.h[1], wtmp

mov inp, in
mov count, #4
Expand Down Expand Up @@ -922,7 +914,7 @@ MLKEM_ASM_NAMESPACE(ntt_asm_opt):
.unreq r56_ptr
.unreq inp
.unreq count
.unreq xtmp
.unreq wtmp
.unreq data0
.unreq data1
.unreq data2
Expand Down
Loading

18 comments on commit c79b97a

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Arm Cortex-A76 (Raspberry Pi 5) benchmarks

Benchmark suite Current: c79b97a Previous: e4ff720 Ratio
ML-KEM-512 keypair 29079 cycles 28986 cycles 1.00
ML-KEM-512 encaps 35432 cycles 35399 cycles 1.00
ML-KEM-512 decaps 45902 cycles 45896 cycles 1.00
ML-KEM-768 keypair 49329 cycles 49364 cycles 1.00
ML-KEM-768 encaps 55607 cycles 55564 cycles 1.00
ML-KEM-768 decaps 70402 cycles 70315 cycles 1.00
ML-KEM-1024 keypair 72008 cycles 71989 cycles 1.00
ML-KEM-1024 encaps 80707 cycles 80746 cycles 1.00
ML-KEM-1024 decaps 100613 cycles 100615 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intel Xeon 4th gen (c7i)

Benchmark suite Current: c79b97a Previous: e4ff720 Ratio
ML-KEM-512 keypair 13925 cycles 13975 cycles 1.00
ML-KEM-512 encaps 17215 cycles 17236 cycles 1.00
ML-KEM-512 decaps 23096 cycles 23052 cycles 1.00
ML-KEM-768 keypair 22542 cycles 22520 cycles 1.00
ML-KEM-768 encaps 24483 cycles 24524 cycles 1.00
ML-KEM-768 decaps 32421 cycles 32548 cycles 1.00
ML-KEM-1024 keypair 31380 cycles 31383 cycles 1.00
ML-KEM-1024 encaps 34911 cycles 34928 cycles 1.00
ML-KEM-1024 decaps 45746 cycles 45798 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AMD EPYC 3rd gen (c6a)

Benchmark suite Current: c79b97a Previous: e4ff720 Ratio
ML-KEM-512 keypair 18100 cycles 18102 cycles 1.00
ML-KEM-512 encaps 23141 cycles 23015 cycles 1.01
ML-KEM-512 decaps 30241 cycles 30251 cycles 1.00
ML-KEM-768 keypair 31098 cycles 31124 cycles 1.00
ML-KEM-768 encaps 33942 cycles 33998 cycles 1.00
ML-KEM-768 decaps 44557 cycles 44518 cycles 1.00
ML-KEM-1024 keypair 44608 cycles 44599 cycles 1.00
ML-KEM-1024 encaps 49888 cycles 49893 cycles 1.00
ML-KEM-1024 decaps 64385 cycles 64399 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intel Xeon 3rd gen (c6i)

Benchmark suite Current: c79b97a Previous: e4ff720 Ratio
ML-KEM-512 keypair 20347 cycles 20350 cycles 1.00
ML-KEM-512 encaps 26976 cycles 26953 cycles 1.00
ML-KEM-512 decaps 35748 cycles 35746 cycles 1.00
ML-KEM-768 keypair 34881 cycles 34886 cycles 1.00
ML-KEM-768 encaps 38179 cycles 38182 cycles 1.00
ML-KEM-768 decaps 50961 cycles 50946 cycles 1.00
ML-KEM-1024 keypair 47935 cycles 47950 cycles 1.00
ML-KEM-1024 encaps 54084 cycles 54099 cycles 1.00
ML-KEM-1024 decaps 71646 cycles 71603 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AMD EPYC 4th gen (c7a)

Benchmark suite Current: c79b97a Previous: e4ff720 Ratio
ML-KEM-512 keypair 14908 cycles 14915 cycles 1.00
ML-KEM-512 encaps 19923 cycles 19645 cycles 1.01
ML-KEM-512 decaps 26307 cycles 26297 cycles 1.00
ML-KEM-768 keypair 25605 cycles 25589 cycles 1.00
ML-KEM-768 encaps 28131 cycles 28078 cycles 1.00
ML-KEM-768 decaps 37805 cycles 37792 cycles 1.00
ML-KEM-1024 keypair 35621 cycles 35753 cycles 1.00
ML-KEM-1024 encaps 40961 cycles 40945 cycles 1.00
ML-KEM-1024 decaps 54488 cycles 54417 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intel Xeon 4th gen (c7i) (no-opt)

Benchmark suite Current: c79b97a Previous: e4ff720 Ratio
ML-KEM-512 keypair 33128 cycles 33236 cycles 1.00
ML-KEM-512 encaps 38683 cycles 38554 cycles 1.00
ML-KEM-512 decaps 50830 cycles 50857 cycles 1.00
ML-KEM-768 keypair 54788 cycles 54878 cycles 1.00
ML-KEM-768 encaps 60679 cycles 60646 cycles 1.00
ML-KEM-768 decaps 75749 cycles 75857 cycles 1.00
ML-KEM-1024 keypair 81909 cycles 81938 cycles 1.00
ML-KEM-1024 encaps 91817 cycles 91771 cycles 1.00
ML-KEM-1024 decaps 111459 cycles 111446 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Graviton3

Benchmark suite Current: c79b97a Previous: e4ff720 Ratio
ML-KEM-512 keypair 18948 cycles 18959 cycles 1.00
ML-KEM-512 encaps 23572 cycles 23572 cycles 1.00
ML-KEM-512 decaps 30674 cycles 30660 cycles 1.00
ML-KEM-768 keypair 32309 cycles 32312 cycles 1.00
ML-KEM-768 encaps 35886 cycles 35886 cycles 1.00
ML-KEM-768 decaps 46027 cycles 46021 cycles 1.00
ML-KEM-1024 keypair 46558 cycles 46634 cycles 1.00
ML-KEM-1024 encaps 52447 cycles 52456 cycles 1.00
ML-KEM-1024 decaps 66212 cycles 66268 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AMD EPYC 3rd gen (c6a) (no-opt)

Benchmark suite Current: c79b97a Previous: e4ff720 Ratio
ML-KEM-512 keypair 43313 cycles 43336 cycles 1.00
ML-KEM-512 encaps 51782 cycles 51844 cycles 1.00
ML-KEM-512 decaps 66992 cycles 67031 cycles 1.00
ML-KEM-768 keypair 71611 cycles 71630 cycles 1.00
ML-KEM-768 encaps 82620 cycles 82693 cycles 1.00
ML-KEM-768 decaps 102871 cycles 103012 cycles 1.00
ML-KEM-1024 keypair 106639 cycles 106602 cycles 1.00
ML-KEM-1024 encaps 121082 cycles 121422 cycles 1.00
ML-KEM-1024 decaps 146923 cycles 146875 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intel Xeon 3rd gen (c6i) (no-opt)

Benchmark suite Current: c79b97a Previous: e4ff720 Ratio
ML-KEM-512 keypair 51612 cycles 51419 cycles 1.00
ML-KEM-512 encaps 59579 cycles 59547 cycles 1.00
ML-KEM-512 decaps 76542 cycles 76562 cycles 1.00
ML-KEM-768 keypair 84268 cycles 84278 cycles 1.00
ML-KEM-768 encaps 94997 cycles 94991 cycles 1.00
ML-KEM-768 decaps 117185 cycles 117180 cycles 1.00
ML-KEM-1024 keypair 124704 cycles 124782 cycles 1.00
ML-KEM-1024 encaps 138906 cycles 138756 cycles 1.00
ML-KEM-1024 decaps 167614 cycles 167416 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AMD EPYC 4th gen (c7a) (no-opt)

Benchmark suite Current: c79b97a Previous: e4ff720 Ratio
ML-KEM-512 keypair 39543 cycles 39375 cycles 1.00
ML-KEM-512 encaps 45583 cycles 45586 cycles 1.00
ML-KEM-512 decaps 59058 cycles 59050 cycles 1.00
ML-KEM-768 keypair 64522 cycles 64596 cycles 1.00
ML-KEM-768 encaps 72873 cycles 72846 cycles 1.00
ML-KEM-768 decaps 91097 cycles 91403 cycles 1.00
ML-KEM-1024 keypair 95976 cycles 95969 cycles 1.00
ML-KEM-1024 encaps 107138 cycles 107130 cycles 1.00
ML-KEM-1024 decaps 130737 cycles 130669 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Arm Cortex-A55 (Snapdragon 888) benchmarks

Benchmark suite Current: c79b97a Previous: e4ff720 Ratio
ML-KEM-512 keypair 58338 cycles 58327 cycles 1.00
ML-KEM-512 encaps 65756 cycles 65795 cycles 1.00
ML-KEM-512 decaps 84486 cycles 84608 cycles 1.00
ML-KEM-768 keypair 99005 cycles 99034 cycles 1.00
ML-KEM-768 encaps 110280 cycles 110325 cycles 1.00
ML-KEM-768 decaps 137289 cycles 137025 cycles 1.00
ML-KEM-1024 keypair 150113 cycles 150344 cycles 1.00
ML-KEM-1024 encaps 166938 cycles 166740 cycles 1.00
ML-KEM-1024 decaps 202856 cycles 202805 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Graviton3 (no-opt)

Benchmark suite Current: c79b97a Previous: e4ff720 Ratio
ML-KEM-512 keypair 39366 cycles 39365 cycles 1.00
ML-KEM-512 encaps 45437 cycles 45441 cycles 1.00
ML-KEM-512 decaps 57494 cycles 57491 cycles 1.00
ML-KEM-768 keypair 65834 cycles 65827 cycles 1.00
ML-KEM-768 encaps 73815 cycles 73817 cycles 1.00
ML-KEM-768 decaps 89871 cycles 89874 cycles 1.00
ML-KEM-1024 keypair 98964 cycles 98958 cycles 1.00
ML-KEM-1024 encaps 110065 cycles 110050 cycles 1.00
ML-KEM-1024 decaps 130844 cycles 130832 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Graviton4

Benchmark suite Current: c79b97a Previous: e4ff720 Ratio
ML-KEM-512 keypair 18126 cycles 18116 cycles 1.00
ML-KEM-512 encaps 22180 cycles 22178 cycles 1.00
ML-KEM-512 decaps 28844 cycles 28839 cycles 1.00
ML-KEM-768 keypair 30548 cycles 30560 cycles 1.00
ML-KEM-768 encaps 33640 cycles 33637 cycles 1.00
ML-KEM-768 decaps 43162 cycles 43158 cycles 1.00
ML-KEM-1024 keypair 44158 cycles 44163 cycles 1.00
ML-KEM-1024 encaps 49641 cycles 49653 cycles 1.00
ML-KEM-1024 decaps 62632 cycles 62642 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Graviton2

Benchmark suite Current: c79b97a Previous: e4ff720 Ratio
ML-KEM-512 keypair 29078 cycles 28986 cycles 1.00
ML-KEM-512 encaps 35454 cycles 35425 cycles 1.00
ML-KEM-512 decaps 45895 cycles 45888 cycles 1.00
ML-KEM-768 keypair 49336 cycles 49378 cycles 1.00
ML-KEM-768 encaps 55606 cycles 55565 cycles 1.00
ML-KEM-768 decaps 70377 cycles 70311 cycles 1.00
ML-KEM-1024 keypair 71998 cycles 71969 cycles 1.00
ML-KEM-1024 encaps 80732 cycles 80763 cycles 1.00
ML-KEM-1024 decaps 100636 cycles 100630 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Graviton4 (no-opt)

Benchmark suite Current: c79b97a Previous: e4ff720 Ratio
ML-KEM-512 keypair 38041 cycles 38042 cycles 1.00
ML-KEM-512 encaps 43378 cycles 43379 cycles 1.00
ML-KEM-512 decaps 55554 cycles 55553 cycles 1.00
ML-KEM-768 keypair 63001 cycles 63014 cycles 1.00
ML-KEM-768 encaps 70322 cycles 70323 cycles 1.00
ML-KEM-768 decaps 86775 cycles 86776 cycles 1.00
ML-KEM-1024 keypair 94472 cycles 94468 cycles 1.00
ML-KEM-1024 encaps 105172 cycles 105175 cycles 1.00
ML-KEM-1024 decaps 126798 cycles 126797 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Graviton2 (no-opt)

Benchmark suite Current: c79b97a Previous: e4ff720 Ratio
ML-KEM-512 keypair 60689 cycles 60686 cycles 1.00
ML-KEM-512 encaps 69815 cycles 69803 cycles 1.00
ML-KEM-512 decaps 88748 cycles 88744 cycles 1.00
ML-KEM-768 keypair 101838 cycles 101761 cycles 1.00
ML-KEM-768 encaps 113975 cycles 113894 cycles 1.00
ML-KEM-768 decaps 139410 cycles 139325 cycles 1.00
ML-KEM-1024 keypair 154152 cycles 154153 cycles 1.00
ML-KEM-1024 encaps 169912 cycles 169847 cycles 1.00
ML-KEM-1024 decaps 202254 cycles 202209 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bananapi bpi-f3 benchmarks

Benchmark suite Current: c79b97a Previous: e4ff720 Ratio
ML-KEM-512 keypair 334930 cycles 334942 cycles 1.00
ML-KEM-512 encaps 443708 cycles 443716 cycles 1.00
ML-KEM-512 decaps 591742 cycles 591749 cycles 1.00
ML-KEM-768 keypair 559197 cycles 559270 cycles 1.00
ML-KEM-768 encaps 697584 cycles 697687 cycles 1.00
ML-KEM-768 decaps 889069 cycles 890201 cycles 1.00
ML-KEM-1024 keypair 828125 cycles 828159 cycles 1.00
ML-KEM-1024 encaps 999895 cycles 999913 cycles 1.00
ML-KEM-1024 decaps 1232864 cycles 1232943 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Arm Cortex-A72 (Raspberry Pi 4) benchmarks

Benchmark suite Current: c79b97a Previous: e4ff720 Ratio
ML-KEM-512 keypair 51960 cycles 51568 cycles 1.01
ML-KEM-512 encaps 58317 cycles 58005 cycles 1.01
ML-KEM-512 decaps 74893 cycles 74147 cycles 1.01
ML-KEM-768 keypair 88245 cycles 87902 cycles 1.00
ML-KEM-768 encaps 97016 cycles 96043 cycles 1.01
ML-KEM-768 decaps 120155 cycles 119330 cycles 1.01
ML-KEM-1024 keypair 131808 cycles 131867 cycles 1.00
ML-KEM-1024 encaps 144544 cycles 145139 cycles 1.00
ML-KEM-1024 decaps 175428 cycles 176050 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.