Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize/keccak #7038

Merged
LukaszRozmej merged 7 commits into master from optimize/keccak
May 16, 2024
Merged

Optimize/keccak #7038

LukaszRozmej merged 7 commits into master from optimize/keccak
May 16, 2024

Conversation

LukaszRozmej
Copy link
Member

Changes

  • Optimize Keccak by ~3%

Types of changes

What types of changes does your code introduce?

  • Bugfix (a non-breaking change that fixes an issue)
  • New feature (a non-breaking change that adds functionality)
  • Breaking change (a change that causes existing functionality not to work as expected)
  • Optimization
  • Refactoring
  • Documentation update
  • Build-related changes
  • Other: Description

Testing

Requires testing

  • Yes
  • No

If yes, did you write tests?

  • Yes
  • No

Remarks

Some benchmark runs:

| Method | Mean | Error | StdDev | Code Size | Allocated |
|---|---|---|---|---|---|
| From_span_keccak | 241.2 ns | 4.50 ns | 4.82 ns | 758 B | - |
| From_span_keccak_fast | 232.7 ns | 2.04 ns | 1.70 ns | 758 B | - |
| From_span_keccak | 241.5 ns | 3.31 ns | 2.59 ns | 758 B | - |
| From_span_keccak_fast | 234.7 ns | 1.58 ns | 1.40 ns | 758 B | - |
| From_span_keccak | 236.8 ns | 0.40 ns | 0.37 ns | 758 B | - |
| From_span_keccak_fast | 232.8 ns | 1.23 ns | 1.02 ns | 758 B | - |
| From_span_keccak | 238.0387 ns | 1.0068 ns | 0.9417 ns | 758 B | - |
| From_span_keccak_fast | 230.8994 ns | 1.3858 ns | 1.2285 ns | 758 B | - |
| From_span_keccak | 237.1530 ns | 1.3545 ns | 1.1311 ns | 758 B | - |
| From_span_keccak_fast | 229.3974 ns | 0.4813 ns | 0.4502 ns | 758 B | - |
| From_span_keccak | 238.5586 ns | 2.4952 ns | 2.2119 ns | 758 B | - |
| From_span_keccak_fast | 233.5361 ns | 4.4838 ns | 4.6046 ns | 758 B | - |

@LukaszRozmej LukaszRozmej requested a review from benaadams May 16, 2024 13:55
@Scooletz
Copy link
Contributor

Can you compare asm and provide benches? Would be great to see the difference.

@Scooletz Scooletz self-requested a review May 16, 2024 14:58
Copy link
Contributor

@Scooletz Scooletz left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Benchmarks look legit!

@LukaszRozmej
Copy link
Member Author

SharpLab doesn't like static constructors, so I made it an instance method:

new:

Nethermind.Core.Crypto.KeccakHash.KeccakF(System.Span`1<UInt64>)
    L0000: push r15
    L0002: push r14
    L0004: push r13
    L0006: push r12
    L0008: push rdi
    L0009: push rsi
    L000a: push rbp
    L000b: push rbx
    L000c: sub rsp, 0x198
    L0013: mov [rsp+0x1e0], rcx
    L001b: mov rax, [rdx]
    L001e: mov edx, [rdx+8]
    L0021: cmp edx, 0x18
    L0024: jbe L098f
    L002a: mov rdx, [rax+0xc0]
    L0031: mov [rsp+0xe0], rdx
    L0039: mov r8, [rax+0xb8]
    L0040: mov [rsp+0xe8], r8
    L0048: mov r10, [rax+0xb0]
    L004f: mov [rsp+0xf0], r10
    L0057: mov r9, [rax+0xa8]
    L005e: mov r11, [rax+0xa0]
    L0065: mov rbx, [rax+0x98]
    L006c: mov [rsp+0x108], rbx
    L0074: mov rsi, [rax+0x90]
    L007b: mov [rsp+0x110], rsi
    L0083: mov rdi, [rax+0x88]
    L008a: mov [rsp+0x118], rdi
    L0092: mov rbp, [rax+0x80]
    L0099: mov r14, [rax+0x78]
    L009d: mov r15, [rax+0x70]
    L00a1: mov [rsp+0x128], r15
    L00a9: mov r13, [rax+0x68]
    L00ad: mov [rsp+0x130], r13
    L00b5: mov r12, [rax+0x60]
    L00b9: mov rcx, [rax+0x58]
    L00bd: mov rdx, [rax+0x50]
    L00c1: mov rbx, [rax+0x48]
    L00c5: mov [rsp+0x148], rbx
    L00cd: mov r15, [rax+0x40]
    L00d1: mov [rsp+0x150], r15
    L00d9: mov rbx, [rax+0x38]
    L00dd: mov r8, [rax+0x30]
    L00e1: mov rsi, [rax+0x28]
    L00e5: mov r13, [rax+0x20]
    L00e9: mov [rsp+0x170], r13
    L00f1: mov r13, [rax+0x18]
    L00f5: mov [rsp+0x178], r13
    L00fd: mov r13, [rax+0x10]
    L0101: mov r15, [rax+8]
    L0105: mov r10, [rax]
    L0108: xor edi, edi
    L010a: mov [rsp+0x168], rsi
    L0112: xor rsi, r10
    L0115: xor rsi, rdx
    L0118: mov [rsp+0x120], r14
    L0120: xor rsi, r14
    L0123: mov [rsp+0x100], r11
    L012b: xor rsi, r11
    L012e: mov [rsp+0x188], r15
    L0136: mov r14, r15
    L0139: xor r14, r8
    L013c: mov [rsp+0x138], rcx
    L0144: xor r14, rcx
    L0147: xor r14, rbp
    L014a: mov [rsp+0xf8], r9
    L0152: xor r14, r9
    L0155: mov [rsp+0x180], r13
    L015d: mov [rsp+0x158], rbx
    L0165: mov r9, r13
    L0168: xor r9, rbx
    L016b: xor r9, r12
    L016e: xor r9, [rsp+0x118]
    L0176: xor r9, [rsp+0xf0]
    L017e: mov r13, [rsp+0x178]
    L0186: xor r13, [rsp+0x150]
    L018e: xor r13, [rsp+0x130]
    L0196: xor r13, [rsp+0x110]
    L019e: xor r13, [rsp+0xe8]
    L01a6: mov rcx, [rsp+0x170]
    L01ae: xor rcx, [rsp+0x148]
    L01b6: xor rcx, [rsp+0x128]
    L01be: xor rcx, [rsp+0x108]
    L01c6: xor rcx, [rsp+0xe0]
    L01ce: rorx r11, r14, 0x3f
    L01d4: xor r11, rcx
    L01d7: rorx rbx, r9, 0x3f
    L01dd: xor rbx, rsi
    L01e0: rorx r15, r13, 0x3f
    L01e6: xor r15, r14
    L01e9: rol rcx, 1
    L01ec: xor rcx, r9
    L01ef: rol rsi, 1
    L01f2: xor rsi, r13
    L01f5: xor r10, r11
    L01f8: xor r8, rbx
    L01fb: rorx r14, r8, 0x14
    L0201: xor r12, r15
    L0204: rorx r9, r12, 0x15
    L020a: mov r8, [rsp+0x1e0]
    L0212: mov r12, [r8+8]
    L0216: mov [rsp+0x28], r12
    L021b: mov r13, r12
    L021e: cmp edi, [r13+8]
    L0222: jae L098f
    L0228: mov r8d, edi
    L022b: andn r12, r14, r9
    L0230: xor r12, r10
    L0233: xor r12, [r13+r8*8+0x10]
    L0238: mov [rsp+0xd8], r12
    L0240: mov r13, rcx
    L0243: xor r13, [rsp+0x110]
    L024b: rol r13, 0x15
    L024f: mov r8, rsi
    L0252: xor r8, [rsp+0xe0]
    L025a: rol r8, 0xe
    L025e: andn r12, r13, r8
    L0263: xor r12, r9
    L0266: mov [rsp+0xd0], r12
    L026e: andn r12, r8, r10
    L0273: xor r12, r13
    L0276: mov [rsp+0xc8], r12
    L027e: andn r10, r10, r14
    L0283: xor r10, r8
    L0286: mov [rsp+0xc0], r10
    L028e: mov r8, rcx
    L0291: xor r8, [rsp+0x178]
    L0299: rol r8, 0x1c
    L029d: andn r9, r9, r13
    L02a2: xor r9, r14
    L02a5: mov r14, rsi
    L02a8: xor r14, [rsp+0x148]
    L02b0: rol r14, 0x14
    L02b4: xor rdx, r11
    L02b7: rol rdx, 3
    L02bb: andn r13, r14, rdx
    L02c0: xor r13, r8
    L02c3: mov [rsp+0xb8], r13
    L02cb: xor rbp, rbx
    L02ce: rol rbp, 0x2d
    L02d2: andn r12, rdx, rbp
    L02d7: xor r12, r14
    L02da: mov r13, r15
    L02dd: xor r13, [rsp+0xf0]
    L02e5: rol r13, 0x3d
    L02e9: andn r10, rbp, r13
    L02ee: xor r10, rdx
    L02f1: mov [rsp+0xb0], r10
    L02f9: andn rdx, r13, r8
    L02fe: xor rdx, rbp
    L0301: mov [rsp+0xa8], rdx
    L0309: andn r14, r8, r14
    L030e: xor r14, r13
    L0311: mov [rsp+0xa0], r14
    L0319: mov r8, rbx
    L031c: xor r8, [rsp+0x188]
    L0324: rol r8, 1
    L0327: mov rbp, r15
    L032a: xor rbp, [rsp+0x158]
    L0332: rol rbp, 6
    L0336: mov r13, rcx
    L0339: xor r13, [rsp+0x130]
    L0341: rol r13, 0x19
    L0345: andn rdx, rbp, r13
    L034a: xor rdx, r8
    L034d: mov [rsp+0x98], rdx
    L0355: mov r10, rsi
    L0358: xor r10, [rsp+0x108]
    L0360: rol r10, 8
    L0364: andn rdx, r13, r10
    L0369: xor rdx, rbp
    L036c: mov [rsp+0x90], rdx
    L0374: mov r14, r11
    L0377: xor r14, [rsp+0x100]
    L037f: rol r14, 0x12
    L0383: andn rdx, r10, r14
    L0388: xor rdx, r13
    L038b: mov [rsp+0x88], rdx
    L0393: andn r13, r14, r8
    L0398: xor r13, r10
    L039b: mov [rsp+0x80], r13
    L03a3: andn r10, r8, rbp
    L03a8: xor r10, r14
    L03ab: mov [rsp+0x78], r10
    L03b0: mov r8, rsi
    L03b3: xor r8, [rsp+0x170]
    L03bb: rol r8, 0x1b
    L03bf: mov rbp, r11
    L03c2: xor rbp, [rsp+0x168]
    L03ca: rol rbp, 0x24
    L03ce: mov r14, rbx
    L03d1: xor r14, [rsp+0x138]
    L03d9: rol r14, 0xa
    L03dd: andn r13, rbp, r14
    L03e2: xor r13, r8
    L03e5: mov [rsp+0x70], r13
    L03ea: mov rdx, r15
    L03ed: xor rdx, [rsp+0x118]
    L03f5: rol rdx, 0xf
    L03f9: andn r13, r14, rdx
    L03fe: xor r13, rbp
    L0401: mov [rsp+0x68], r13
    L0406: mov r10, rcx
    L0409: xor r10, [rsp+0xe8]
    L0411: rol r10, 0x38
    L0415: andn r13, rdx, r10
    L041a: xor r13, r14
    L041d: mov [rsp+0x60], r13
    L0422: andn r14, r10, r8
    L0427: xor r14, rdx
    L042a: andn rdx, r8, rbp
    L042f: xor rdx, r10
    L0432: mov [rsp+0x58], rdx
    L0437: xor r15, [rsp+0x180]
    L043f: rorx r8, r15, 2
    L0445: xor rcx, [rsp+0x150]
    L044d: rorx rbp, rcx, 9
    L0453: xor rsi, [rsp+0x128]
    L045b: rol rsi, 0x27
    L045f: mov rcx, rsi
    L0462: andn r10, rbp, rcx
    L0467: xor r10, r8
    L046a: mov [rsp+0x50], r10
    L046f: xor r11, [rsp+0x120]
    L0477: rol r11, 0x29
    L047b: andn rsi, rcx, r11
    L0480: xor rsi, rbp
    L0483: mov [rsp+0x48], rsi
    L0488: xor rbx, [rsp+0xf8]
    L0490: rol rbx, 2
    L0494: andn r15, r11, rbx
    L0499: xor r15, rcx
    L049c: mov [rsp+0x40], r15
    L04a1: andn rcx, rbx, r8
    L04a6: xor rcx, r11
    L04a9: mov [rsp+0x38], rcx
    L04ae: andn r11, r8, rbp
    L04b3: xor r11, rbx
    L04b6: mov [rsp+0x30], r11
    L04bb: mov rbp, r9
    L04be: xor rbp, r12
    L04c1: xor rbp, [rsp+0x90]
    L04c9: xor rbp, [rsp+0x68]
    L04ce: xor rbp, rsi
    L04d1: mov r8, [rsp+0xc0]
    L04d9: xor r8, [rsp+0xa0]
    L04e1: xor r8, [rsp+0x78]
    L04e6: xor r8, rdx
    L04e9: xor r8, r11
    L04ec: rorx rbx, rbp, 0x3f
    L04f2: xor rbx, r8
    L04f5: mov rsi, [rsp+0xd8]
    L04fd: xor rsi, [rsp+0xb8]
    L0505: xor rsi, [rsp+0x98]
    L050d: xor rsi, [rsp+0x70]
    L0512: xor rsi, r10
    L0515: mov r10, [rsp+0xd0]
    L051d: xor r10, [rsp+0xb0]
    L0525: xor r10, [rsp+0x88]
    L052d: xor r10, r13
    L0530: xor r10, r15
    L0533: rorx r13, r10, 0x3f
    L0539: xor r13, rsi
    L053c: mov rdx, [rsp+0xc8]
    L0544: xor rdx, [rsp+0xa8]
    L054c: xor rdx, [rsp+0x80]
    L0554: xor rdx, r14
    L0557: xor rdx, rcx
    L055a: rorx rcx, rdx, 0x3f
    L0560: xor rcx, rbp
    L0563: rol r8, 1
    L0566: xor r8, r10
    L0569: rol rsi, 1
    L056c: xor rsi, rdx
    L056f: mov r10, rcx
    L0572: xor r10, [rsp+0x88]
    L057a: rol r10, 0x2b
    L057e: xor r12, r13
    L0581: rorx rbp, r12, 0x14
    L0587: mov rdx, rbx
    L058a: xor rdx, [rsp+0xd8]
    L0592: mov r12, [rsp+0x28]
    L0597: lea r15d, [rdi+1]
    L059b: cmp r15d, [r12+8]
    L05a0: jae L098f
    L05a6: lea r15d, [rdi+1]
    L05aa: andn r11, rbp, r10
    L05af: xor r11, rdx
    L05b2: xor r11, [r12+r15*8+0x10]
    L05b7: mov [rsp+0x190], r11
    L05bf: xor r14, r8
    L05c2: rol r14, 0x15
    L05c6: andn r15, r10, r14
    L05cb: xor r15, rbp
    L05ce: mov [rsp+0x188], r15
    L05d6: mov r12, rsi
    L05d9: xor r12, [rsp+0x30]
    L05de: rol r12, 0xe
    L05e2: andn r15, r14, r12
    L05e7: xor r15, r10
    L05ea: mov [rsp+0x180], r15
    L05f2: andn r10, r12, rdx
    L05f7: xor r10, r14
    L05fa: mov [rsp+0x178], r10
    L0602: andn rdx, rdx, rbp
    L0607: xor rdx, r12
    L060a: mov [rsp+0x170], rdx
    L0612: mov rbp, r8
    L0615: xor rbp, [rsp+0xc8]
    L061d: rol rbp, 0x1c
    L0621: mov r14, rsi
    L0624: xor r14, [rsp+0xa0]
    L062c: rol r14, 0x14
    L0630: mov r12, rbx
    L0633: xor r12, [rsp+0x98]
    L063b: rol r12, 3
    L063f: andn r15, r14, r12
    L0644: xor r15, rbp
    L0647: mov [rsp+0x168], r15
    L064f: mov r10, r13
    L0652: xor r10, [rsp+0x68]
    L0657: rol r10, 0x2d
    L065b: andn rdx, r12, r10
    L0660: xor rdx, r14
    L0663: mov [rsp+0x160], rdx
    L066b: mov r15, rcx
    L066e: xor r15, [rsp+0x40]
    L0673: rol r15, 0x3d
    L0677: andn r11, r10, r15
    L067c: xor r11, r12
    L067f: mov [rsp+0x158], r11
    L0687: andn r12, r15, rbp
    L068c: xor r12, r10
    L068f: mov [rsp+0x150], r12
    L0697: andn rbp, rbp, r14
    L069c: xor rbp, r15
    L069f: mov [rsp+0x148], rbp
    L06a7: xor r9, r13
    L06aa: rorx rbp, r9, 0x3f
    L06b0: mov r9, rcx
    L06b3: xor r9, [rsp+0xb0]
    L06bb: rol r9, 6
    L06bf: mov r12, r8
    L06c2: xor r12, [rsp+0x80]
    L06ca: rol r12, 0x19
    L06ce: andn r15, r9, r12
    L06d3: xor r15, rbp
    L06d6: mov [rsp+0x140], r15
    L06de: mov r11, rsi
    L06e1: xor r11, [rsp+0x58]
    L06e6: rol r11, 8
    L06ea: andn r10, r12, r11
    L06ef: xor r10, r9
    L06f2: mov [rsp+0x138], r10
    L06fa: mov r14, rbx
    L06fd: xor r14, [rsp+0x50]
    L0702: rol r14, 0x12
    L0706: andn r10, r11, r14
    L070b: xor r12, r10
    L070e: andn r10, r14, rbp
    L0713: xor r10, r11
    L0716: mov [rsp+0x130], r10
    L071e: andn r9, rbp, r9
    L0723: xor r9, r14
    L0726: mov [rsp+0x128], r9
    L072e: mov rbp, rsi
    L0731: xor rbp, [rsp+0xc0]
    L0739: rol rbp, 0x1b
    L073d: mov r11, rbx
    L0740: xor r11, [rsp+0xb8]
    L0748: rol r11, 0x24
    L074c: mov r14, r13
    L074f: xor r14, [rsp+0x90]
    L0757: rol r14, 0xa
    L075b: andn r10, r11, r14
    L0760: xor r10, rbp
    L0763: mov [rsp+0x120], r10
    L076b: mov r9, rcx
    L076e: xor r9, [rsp+0x60]
    L0773: rol r9, 0xf
    L0777: andn r10, r14, r9
    L077c: xor r10, r11
    L077f: mov rdx, r8
    L0782: xor rdx, [rsp+0x38]
    L0787: rol rdx, 0x38
    L078b: andn r15, r9, rdx
    L0790: xor r15, r14
    L0793: andn r14, rdx, rbp
    L0798: xor r14, r9
    L079b: mov r9, r14
    L079e: andn r11, rbp, r11
    L07a3: xor r11, rdx
    L07a6: mov rbp, rcx
    L07a9: xor rbp, [rsp+0xd0]
    L07b1: rol rbp, 0x3e
    L07b5: mov rdx, r8
    L07b8: xor rdx, [rsp+0xa8]
    L07c0: rol rdx, 0x37
    L07c4: mov r14, rsi
    L07c7: xor r14, [rsp+0x78]
    L07cc: rol r14, 0x27
    L07d0: andn rcx, rdx, r14
    L07d5: xor rcx, rbp
    L07d8: mov r8, rbx
    L07db: xor r8, [rsp+0x70]
    L07e0: rol r8, 0x29
    L07e4: andn rbx, r14, r8
    L07e9: xor rbx, rdx
    L07ec: mov rsi, r13
    L07ef: xor rsi, [rsp+0x48]
    L07f4: rol rsi, 2
    L07f8: andn r13, r8, rsi
    L07fd: xor r13, r14
    L0800: andn r14, rsi, rbp
    L0805: xor r8, r14
    L0808: andn rdx, rbp, rdx
    L080d: xor rdx, rsi
    L0810: add edi, 2
    L0813: cmp edi, 0x18
    L0816: mov [rsp+0x118], r15
    L081e: mov [rsp+0x110], r9
    L0826: mov [rsp+0x108], r11
    L082e: mov [rsp+0xf0], r13
    L0836: mov [rsp+0xe8], r8
    L083e: mov [rsp+0xe0], rdx
    L0846: mov rbp, r10
    L0849: mov r9, rbx
    L084c: mov r11, rcx
    L084f: mov rcx, [rsp+0x138]
    L0857: mov rdx, [rsp+0x140]
    L085f: mov rbx, [rsp+0x158]
    L0867: mov rsi, [rsp+0x168]
    L086f: mov r8, [rsp+0x160]
    L0877: mov r10, [rsp+0x190]
    L087f: mov r13, [rsp+0x180]
    L0887: mov r14, [rsp+0x120]
    L088f: mov r15, [rsp+0x188]
    L0897: jl L010a
    L089d: mov rdi, [rsp+0xe0]
    L08a5: mov [rax+0xc0], rdi
    L08ac: mov rdi, [rsp+0xe8]
    L08b4: mov [rax+0xb8], rdi
    L08bb: mov rdi, [rsp+0xf0]
    L08c3: mov [rax+0xb0], rdi
    L08ca: mov [rax+0xa8], r9
    L08d1: mov [rax+0xa0], r11
    L08d8: mov r11, [rsp+0x108]
    L08e0: mov [rax+0x98], r11
    L08e7: mov r9, [rsp+0x110]
    L08ef: mov [rax+0x90], r9
    L08f6: mov r9, [rsp+0x118]
    L08fe: mov [rax+0x88], r9
    L0905: mov [rax+0x80], rbp
    L090c: mov [rax+0x78], r14
    L0910: mov r9, [rsp+0x128]
    L0918: mov [rax+0x70], r9
    L091c: mov r9, [rsp+0x130]
    L0924: mov [rax+0x68], r9
    L0928: mov [rax+0x60], r12
    L092c: mov [rax+0x58], rcx
    L0930: mov [rax+0x50], rdx
    L0934: mov r14, [rsp+0x148]
    L093c: mov [rax+0x48], r14
    L0940: mov rcx, [rsp+0x150]
    L0948: mov [rax+0x40], rcx
    L094c: mov [rax+0x38], rbx
    L0950: mov [rax+0x30], r8
    L0954: mov [rax+0x28], rsi
    L0958: mov rcx, [rsp+0x170]
    L0960: mov [rax+0x20], rcx
    L0964: mov r8, [rsp+0x178]
    L096c: mov [rax+0x18], r8
    L0970: mov [rax+0x10], r13
    L0974: mov [rax+8], r15
    L0978: mov [rax], r10
    L097b: add rsp, 0x198
    L0982: pop rbx
    L0983: pop rbp
    L0984: pop rsi
    L0985: pop rdi
    L0986: pop r12
    L0988: pop r13
    L098a: pop r14
    L098c: pop r15
    L098e: ret
    L098f: call 0x00000170b32d00d4
    L0994: int3

old:

Nethermind.Core.Crypto.KeccakHash.KeccakF(System.Span`1<UInt64>)
    L0000: push r15
    L0002: push r14
    L0004: push r13
    L0006: push r12
    L0008: push rdi
    L0009: push rsi
    L000a: push rbp
    L000b: push rbx
    L000c: sub rsp, 0x198
    L0013: mov [rsp+0x1e0], rcx
    L001b: mov rax, [rdx]
    L001e: mov edx, [rdx+8]
    L0021: cmp edx, 0x18
    L0024: jbe L09ab
    L002a: mov rdx, [rax]
    L002d: mov r8, [rax+8]
    L0031: mov r10, [rax+0x10]
    L0035: mov r9, [rax+0x18]
    L0039: mov r11, [rax+0x20]
    L003d: mov [rsp+0x180], r11
    L0045: mov rbx, [rax+0x28]
    L0049: mov rsi, [rax+0x30]
    L004d: mov rdi, [rax+0x38]
    L0051: mov rbp, [rax+0x40]
    L0055: mov [rsp+0x160], rbp
    L005d: mov r14, [rax+0x48]
    L0061: mov [rsp+0x158], r14
    L0069: mov r15, [rax+0x50]
    L006d: mov r13, [rax+0x58]
    L0071: mov r12, [rax+0x60]
    L0075: mov [rsp+0x140], r12
    L007d: mov rcx, [rax+0x68]
    L0081: mov [rsp+0x138], rcx
    L0089: mov r11, [rax+0x70]
    L008d: mov [rsp+0x130], r11
    L0095: mov r11, [rax+0x78]
    L0099: mov r14, [rax+0x80]
    L00a0: mov rcx, [rax+0x88]
    L00a7: mov [rsp+0x118], rcx
    L00af: mov rbp, [rax+0x90]
    L00b6: mov [rsp+0x110], rbp
    L00be: mov rbp, [rax+0x98]
    L00c5: mov [rsp+0x108], rbp
    L00cd: mov rbp, [rax+0xa0]
    L00d4: mov rcx, [rax+0xa8]
    L00db: mov r12, [rax+0xb0]
    L00e2: mov [rsp+0xf0], r12
    L00ea: mov r12, [rax+0xb8]
    L00f1: mov [rsp+0xe8], r12
    L00f9: mov r12, [rax+0xc0]
    L0100: mov [rsp+0xe0], r12
    L0108: xor r12d, r12d
    L010b: mov [rsp+0x178], rbx
    L0113: xor rbx, rdx
    L0116: xor rbx, r15
    L0119: mov [rsp+0x128], r11
    L0121: xor rbx, r11
    L0124: mov [rsp+0x100], rbp
    L012c: xor rbx, rbp
    L012f: mov [rsp+0x190], r8
    L0137: mov r11, r8
    L013a: xor r11, rsi
    L013d: mov [rsp+0x148], r13
    L0145: xor r11, r13
    L0148: mov [rsp+0x120], r14
    L0150: xor r11, r14
    L0153: mov [rsp+0xf8], rcx
    L015b: xor r11, rcx
    L015e: mov [rsp+0x188], r10
    L0166: mov [rsp+0x168], rdi
    L016e: mov rcx, r10
    L0171: xor rcx, rdi
    L0174: xor rcx, [rsp+0x140]
    L017c: xor rcx, [rsp+0x118]
    L0184: xor rcx, [rsp+0xf0]
    L018c: mov r10, r9
    L018f: xor r10, [rsp+0x160]
    L0197: xor r10, [rsp+0x138]
    L019f: xor r10, [rsp+0x110]
    L01a7: xor r10, [rsp+0xe8]
    L01af: mov r13, [rsp+0x180]
    L01b7: xor r13, [rsp+0x158]
    L01bf: xor r13, [rsp+0x130]
    L01c7: xor r13, [rsp+0x108]
    L01cf: xor r13, [rsp+0xe0]
    L01d7: rorx rbp, r11, 0x3f
    L01dd: xor rbp, r13
    L01e0: rorx rdi, rcx, 0x3f
    L01e6: xor rdi, rbx
    L01e9: rorx r8, r10, 0x3f
    L01ef: xor r8, r11
    L01f2: rol r13, 1
    L01f5: xor rcx, r13
    L01f8: rol rbx, 1
    L01fb: xor rbx, r10
    L01fe: xor rdx, rbp
    L0201: xor rsi, rdi
    L0204: rorx r11, rsi, 0x14
    L020a: mov r10, r8
    L020d: xor r10, [rsp+0x140]
    L0215: rol r10, 0x2b
    L0219: mov rsi, rcx
    L021c: xor rsi, [rsp+0x110]
    L0224: rol rsi, 0x15
    L0228: mov r13, rbx
    L022b: xor r13, [rsp+0xe0]
    L0233: rol r13, 0xe
    L0237: andn r14, r10, rsi
    L023c: xor r14, r11
    L023f: mov [rsp+0xd8], r14
    L0247: andn r14, rsi, r13
    L024c: xor r14, r10
    L024f: mov [rsp+0xd0], r14
    L0257: andn r14, r13, rdx
    L025c: xor r14, rsi
    L025f: mov [rsp+0xc8], r14
    L0267: andn rsi, rdx, r11
    L026c: xor rsi, r13
    L026f: mov [rsp+0xc0], rsi
    L0277: xor r9, rcx
    L027a: mov r13, [rsp+0x1e0]
    L0282: mov r13, [r13+8]
    L0286: mov [rsp+0x28], r13
    L028b: cmp r12d, [r13+8]
    L028f: jae L09ab
    L0295: mov esi, r12d
    L0298: andn r10, r11, r10
    L029d: xor r10, rdx
    L02a0: xor r10, [r13+rsi*8+0x10]
    L02a5: rorx rdx, r9, 0x24
    L02ab: mov r11, rbx
    L02ae: xor r11, [rsp+0x158]
    L02b6: rol r11, 0x14
    L02ba: xor r15, rbp
    L02bd: rol r15, 3
    L02c1: mov rsi, rdi
    L02c4: xor rsi, [rsp+0x120]
    L02cc: rol rsi, 0x2d
    L02d0: mov r13, r8
    L02d3: xor r13, [rsp+0xf0]
    L02db: rol r13, 0x3d
    L02df: andn r9, r11, r15
    L02e4: xor r9, rdx
    L02e7: mov [rsp+0xb8], r9
    L02ef: andn r14, r15, rsi
    L02f4: xor r14, r11
    L02f7: mov [rsp+0xb0], r14
    L02ff: andn r14, rsi, r13
    L0304: xor r14, r15
    L0307: mov [rsp+0xa8], r14
    L030f: andn r15, r13, rdx
    L0314: xor r15, rsi
    L0317: mov [rsp+0xa0], r15
    L031f: andn r11, rdx, r11
    L0324: xor r11, r13
    L0327: mov [rsp+0x98], r11
    L032f: mov rdx, rdi
    L0332: xor rdx, [rsp+0x190]
    L033a: rol rdx, 1
    L033d: mov rsi, r8
    L0340: xor rsi, [rsp+0x168]
    L0348: rol rsi, 6
    L034c: mov r13, rcx
    L034f: xor r13, [rsp+0x138]
    L0357: rol r13, 0x19
    L035b: mov r11, rbx
    L035e: xor r11, [rsp+0x108]
    L0366: rol r11, 8
    L036a: mov r15, rbp
    L036d: xor r15, [rsp+0x100]
    L0375: rol r15, 0x12
    L0379: andn r14, rsi, r13
    L037e: xor r14, rdx
    L0381: mov [rsp+0x90], r14
    L0389: andn r14, r13, r11
    L038e: xor r14, rsi
    L0391: mov [rsp+0x88], r14
    L0399: andn r14, r11, r15
    L039e: xor r14, r13
    L03a1: mov [rsp+0x80], r14
    L03a9: andn r13, r15, rdx
    L03ae: xor r13, r11
    L03b1: mov [rsp+0x78], r13
    L03b6: andn r11, rdx, rsi
    L03bb: xor r11, r15
    L03be: mov [rsp+0x70], r11
    L03c3: mov rdx, rbx
    L03c6: xor rdx, [rsp+0x180]
    L03ce: rol rdx, 0x1b
    L03d2: mov rsi, rbp
    L03d5: xor rsi, [rsp+0x178]
    L03dd: rol rsi, 0x24
    L03e1: mov r15, rdi
    L03e4: xor r15, [rsp+0x148]
    L03ec: rol r15, 0xa
    L03f0: mov r11, r8
    L03f3: xor r11, [rsp+0x118]
    L03fb: rol r11, 0xf
    L03ff: mov r13, rcx
    L0402: xor r13, [rsp+0xe8]
    L040a: rol r13, 0x38
    L040e: andn r14, rsi, r15
    L0413: xor r14, rdx
    L0416: mov [rsp+0x68], r14
    L041b: andn r14, r15, r11
    L0420: xor r14, rsi
    L0423: mov [rsp+0x60], r14
    L0428: andn r14, r11, r13
    L042d: xor r14, r15
    L0430: mov [rsp+0x58], r14
    L0435: andn r15, r13, rdx
    L043a: xor r15, r11
    L043d: andn r11, rdx, rsi
    L0442: xor r11, r13
    L0445: mov [rsp+0x50], r11
    L044a: xor r8, [rsp+0x188]
    L0452: rorx rdx, r8, 2
    L0458: xor rcx, [rsp+0x160]
    L0460: rorx rsi, rcx, 9
    L0466: xor rbx, [rsp+0x130]
    L046e: rol rbx, 0x27
    L0472: mov rcx, rbx
    L0475: xor rbp, [rsp+0x128]
    L047d: rol rbp, 0x29
    L0481: mov rbx, rbp
    L0484: xor rdi, [rsp+0xf8]
    L048c: rorx r13, rdi, 0x3e
    L0492: andn r8, rsi, rcx
    L0497: xor r8, rdx
    L049a: mov [rsp+0x48], r8
    L049f: andn rdi, rcx, rbx
    L04a4: xor rdi, rsi
    L04a7: mov [rsp+0x40], rdi
    L04ac: andn rbp, rbx, r13
    L04b1: xor rbp, rcx
    L04b4: mov [rsp+0x38], rbp
    L04b9: andn rcx, r13, rdx
    L04be: xor rcx, rbx
    L04c1: mov [rsp+0x30], rcx
    L04c6: andn rdx, rdx, rsi
    L04cb: xor rdx, r13
    L04ce: mov rsi, r10
    L04d1: xor rsi, r9
    L04d4: xor rsi, [rsp+0x90]
    L04dc: xor rsi, [rsp+0x68]
    L04e1: xor rsi, r8
    L04e4: mov rbx, [rsp+0xd8]
    L04ec: xor rbx, [rsp+0xb0]
    L04f4: xor rbx, [rsp+0x88]
    L04fc: xor rbx, [rsp+0x60]
    L0501: xor rbx, rdi
    L0504: mov r13, [rsp+0xd0]
    L050c: xor r13, [rsp+0xa8]
    L0514: xor r13, [rsp+0x80]
    L051c: xor r13, r14
    L051f: xor r13, rbp
    L0522: mov rdi, [rsp+0xc8]
    L052a: xor rdi, [rsp+0xa0]
    L0532: xor rdi, [rsp+0x78]
    L0537: xor rdi, r15
    L053a: xor rdi, rcx
    L053d: mov rcx, [rsp+0xc0]
    L0545: xor rcx, [rsp+0x98]
    L054d: xor rcx, [rsp+0x70]
    L0552: xor rcx, r11
    L0555: xor rcx, rdx
    L0558: rorx r14, rbx, 0x3f
    L055e: xor r14, rcx
    L0561: rorx r9, r13, 0x3f
    L0567: xor r9, rsi
    L056a: rorx r8, rdi, 0x3f
    L0570: xor r8, rbx
    L0573: rol rcx, 1
    L0576: xor rcx, r13
    L0579: rorx rbx, rsi, 0x3f
    L057f: xor rbx, rdi
    L0582: mov rsi, r10
    L0585: xor rsi, r14
    L0588: mov r10, r9
    L058b: xor r10, [rsp+0xb0]
    L0593: rol r10, 0x2c
    L0597: mov r13, r8
    L059a: xor r13, [rsp+0x80]
    L05a2: rol r13, 0x2b
    L05a6: xor r15, rcx
    L05a9: rorx rdi, r15, 0x2b
    L05af: xor rdx, rbx
    L05b2: rol rdx, 0xe
    L05b6: mov r15, [rsp+0x28]
    L05bb: lea r11d, [r12+1]
    L05c0: cmp r11d, [r15+8]
    L05c4: jae L09ab
    L05ca: lea r11d, [r12+1]
    L05cf: andn rbp, r10, r13
    L05d4: xor rbp, rsi
    L05d7: xor rbp, [r15+r11*8+0x10]
    L05dc: andn r11, r13, rdi
    L05e1: xor r11, r10
    L05e4: mov [rsp+0x190], r11
    L05ec: andn r15, rdi, rdx
    L05f1: xor r15, r13
    L05f4: mov [rsp+0x188], r15
    L05fc: andn r13, rdx, rsi
    L0601: xor r13, rdi
    L0604: andn r10, rsi, r10
    L0609: xor r10, rdx
    L060c: mov [rsp+0x180], r10
    L0614: mov rsi, rcx
    L0617: xor rsi, [rsp+0xc8]
    L061f: rol rsi, 0x1c
    L0623: mov rdx, rbx
    L0626: xor rdx, [rsp+0x98]
    L062e: rol rdx, 0x14
    L0632: mov rdi, r14
    L0635: xor rdi, [rsp+0x90]
    L063d: rol rdi, 3
    L0641: mov r10, r9
    L0644: xor r10, [rsp+0x60]
    L0649: rol r10, 0x2d
    L064d: mov r15, r8
    L0650: xor r15, [rsp+0x38]
    L0655: rol r15, 0x3d
    L0659: andn r11, rdx, rdi
    L065e: xor r11, rsi
    L0661: mov [rsp+0x178], r11
    L0669: andn r11, rdi, r10
    L066e: xor r11, rdx
    L0671: mov [rsp+0x170], r11
    L0679: andn r11, r10, r15
    L067e: xor rdi, r11
    L0681: mov [rsp+0x168], rdi
    L0689: andn r11, r15, rsi
    L068e: xor r11, r10
    L0691: mov [rsp+0x160], r11
    L0699: andn rdx, rsi, rdx
    L069e: xor rdx, r15
    L06a1: mov [rsp+0x158], rdx
    L06a9: mov rsi, r9
    L06ac: xor rsi, [rsp+0xd8]
    L06b4: rol rsi, 1
    L06b7: mov r10, r8
    L06ba: xor r10, [rsp+0xa8]
    L06c2: rol r10, 6
    L06c6: mov r15, rcx
    L06c9: xor r15, [rsp+0x78]
    L06ce: rol r15, 0x19
    L06d2: mov rdx, rbx
    L06d5: xor rdx, [rsp+0x50]
    L06da: rol rdx, 8
    L06de: mov r11, r14
    L06e1: xor r11, [rsp+0x48]
    L06e6: rol r11, 0x12
    L06ea: andn rdi, r10, r15
    L06ef: xor rdi, rsi
    L06f2: mov [rsp+0x150], rdi
    L06fa: andn rdi, r15, rdx
    L06ff: xor rdi, r10
    L0702: mov [rsp+0x148], rdi
    L070a: andn rdi, rdx, r11
    L070f: xor rdi, r15
    L0712: mov [rsp+0x140], rdi
    L071a: andn r15, r11, rsi
    L071f: xor r15, rdx
    L0722: mov [rsp+0x138], r15
    L072a: andn r10, rsi, r10
    L072f: xor r11, r10
    L0732: mov [rsp+0x130], r11
    L073a: mov rsi, rbx
    L073d: xor rsi, [rsp+0xc0]
    L0745: rol rsi, 0x1b
    L0749: mov r10, r14
    L074c: xor r10, [rsp+0xb8]
    L0754: rol r10, 0x24
    L0758: mov r15, r9
    L075b: xor r15, [rsp+0x88]
    L0763: rol r15, 0xa
    L0767: mov r11, r8
    L076a: xor r11, [rsp+0x58]
    L076f: rol r11, 0xf
    L0773: mov rdx, rcx
    L0776: xor rdx, [rsp+0x30]
    L077b: rol rdx, 0x38
    L077f: andn rdi, r10, r15
    L0784: xor rdi, rsi
    L0787: mov [rsp+0x128], rdi
    L078f: andn rdi, r15, r11
    L0794: xor rdi, r10
    L0797: mov [rsp+0x120], rdi
    L079f: andn rdi, r11, rdx
    L07a4: xor rdi, r15
    L07a7: andn r15, rdx, rsi
    L07ac: xor r15, r11
    L07af: mov r11, r15
    L07b2: andn r10, rsi, r10
    L07b7: xor r10, rdx
    L07ba: mov rsi, r8
    L07bd: xor rsi, [rsp+0xd0]
    L07c5: rol rsi, 0x3e
    L07c9: mov rdx, rcx
    L07cc: xor rdx, [rsp+0xa0]
    L07d4: rol rdx, 0x37
    L07d8: mov r15, rbx
    L07db: xor r15, [rsp+0x70]
    L07e0: rol r15, 0x27
    L07e4: mov rcx, r14
    L07e7: xor rcx, [rsp+0x68]
    L07ec: rol rcx, 0x29
    L07f0: mov r8, r9
    L07f3: xor r8, [rsp+0x40]
    L07f8: rol r8, 2
    L07fc: andn r9, rdx, r15
    L0801: xor r9, rsi
    L0804: andn rbx, r15, rcx
    L0809: xor rbx, rdx
    L080c: andn r14, rcx, r8
    L0811: xor r14, r15
    L0814: andn r15, r8, rsi
    L0819: xor r15, rcx
    L081c: mov rcx, r15
    L081f: andn rdx, rsi, rdx
    L0824: xor rdx, r8
    L0827: add r12d, 2
    L082b: cmp r12d, 0x18
    L082f: mov [rsp+0x118], rdi
    L0837: mov [rsp+0x110], r11
    L083f: mov [rsp+0x108], r10
    L0847: mov [rsp+0xf0], r14
    L084f: mov [rsp+0xe8], rcx
    L0857: mov [rsp+0xe0], rdx
    L085f: mov rcx, rbx
    L0862: mov rdx, rbp
    L0865: mov rbp, r9
    L0868: mov r9, r13
    L086b: mov rbx, [rsp+0x178]
    L0873: mov rsi, [rsp+0x170]
    L087b: mov rdi, [rsp+0x168]
    L0883: mov r8, [rsp+0x190]
    L088b: mov r10, [rsp+0x188]
    L0893: mov r11, [rsp+0x128]
    L089b: mov r13, [rsp+0x148]
    L08a3: mov r14, [rsp+0x120]
    L08ab: mov r15, [rsp+0x150]
    L08b3: jl L010b
    L08b9: mov [rax], rdx
    L08bc: mov [rax+8], r8
    L08c0: mov [rax+0x10], r10
    L08c4: mov [rax+0x18], r9
    L08c8: mov r10, [rsp+0x180]
    L08d0: mov [rax+0x20], r10
    L08d4: mov [rax+0x28], rbx
    L08d8: mov [rax+0x30], rsi
    L08dc: mov [rax+0x38], rdi
    L08e0: mov rdx, [rsp+0x160]
    L08e8: mov [rax+0x40], rdx
    L08ec: mov rdx, [rsp+0x158]
    L08f4: mov [rax+0x48], rdx
    L08f8: mov [rax+0x50], r15
    L08fc: mov [rax+0x58], r13
    L0900: mov rdi, [rsp+0x140]
    L0908: mov [rax+0x60], rdi
    L090c: mov rdx, [rsp+0x138]
    L0914: mov [rax+0x68], rdx
    L0918: mov rdx, [rsp+0x130]
    L0920: mov [rax+0x70], rdx
    L0924: mov [rax+0x78], r11
    L0928: mov [rax+0x80], r14
    L092f: mov rdi, [rsp+0x118]
    L0937: mov [rax+0x88], rdi
    L093e: mov r11, [rsp+0x110]
    L0946: mov [rax+0x90], r11
    L094d: mov r10, [rsp+0x108]
    L0955: mov [rax+0x98], r10
    L095c: mov [rax+0xa0], rbp
    L0963: mov [rax+0xa8], rcx
    L096a: mov r14, [rsp+0xf0]
    L0972: mov [rax+0xb0], r14
    L0979: mov rcx, [rsp+0xe8]
    L0981: mov [rax+0xb8], rcx
    L0988: mov rdx, [rsp+0xe0]
    L0990: mov [rax+0xc0], rdx
    L0997: add rsp, 0x198
    L099e: pop rbx
    L099f: pop rbp
    L09a0: pop rsi
    L09a1: pop rdi
    L09a2: pop r12
    L09a4: pop r13
    L09a6: pop r14
    L09a8: pop r15
    L09aa: ret
    L09ab: call 0x00007ffc2d330da0
    L09b0: int3

@LukaszRozmej LukaszRozmej merged commit 6af7b05 into master May 16, 2024
67 checks passed
@LukaszRozmej LukaszRozmej deleted the optimize/keccak branch May 16, 2024 18:00
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants