diff --git a/docs/algorithms/kem/ml_kem.md b/docs/algorithms/kem/ml_kem.md index 1b9244c41..73b4e0f80 100644 --- a/docs/algorithms/kem/ml_kem.md +++ b/docs/algorithms/kem/ml_kem.md @@ -7,7 +7,7 @@ - **Authors' website**: https://pq-crystals.org/kyber/ and https://csrc.nist.gov/pubs/fips/203 - **Specification version**: ML-KEM. - **Primary Source**: - - **Source**: https://github.com/pq-code-package/mlkem-native/commit/21c0c397f243543a9d4334860d9edb1d4e6a6cda + - **Source**: https://github.com/pq-code-package/mlkem-native/commit/68a82c658399c470624087b52c6d99032114c0b5 - **Implementation license (SPDX-Identifier)**: CC0-1.0 or Apache-2.0 @@ -24,6 +24,8 @@ | Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage?‡ | |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:----------------------| | [Primary Source](#primary-source) | ref | All | All | None | True | True | False | +| [Primary Source](#primary-source) | x86\_64 | x86\_64 | Linux,Darwin | AVX2,BMI2,POPCNT | True | True | False | +| [Primary Source](#primary-source) | aarch64 | ARM64\_V8 | Linux,Darwin | None | True | False | False | Are implementations chosen based on runtime CPU feature detection? **Yes**. @@ -34,6 +36,8 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**. | Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage? 
| |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------| | [Primary Source](#primary-source) | ref | All | All | None | True | True | False | +| [Primary Source](#primary-source) | x86\_64 | x86\_64 | Linux,Darwin | AVX2,BMI2,POPCNT | True | True | False | +| [Primary Source](#primary-source) | aarch64 | ARM64\_V8 | Linux,Darwin | None | True | False | False | Are implementations chosen based on runtime CPU feature detection? **Yes**. @@ -42,6 +46,8 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**. | Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage? | |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------| | [Primary Source](#primary-source) | ref | All | All | None | True | True | False | +| [Primary Source](#primary-source) | x86\_64 | x86\_64 | Linux,Darwin | AVX2,BMI2,POPCNT | True | True | False | +| [Primary Source](#primary-source) | aarch64 | ARM64\_V8 | Linux,Darwin | None | True | False | False | Are implementations chosen based on runtime CPU feature detection? **Yes**. 
diff --git a/docs/algorithms/kem/ml_kem.yml b/docs/algorithms/kem/ml_kem.yml index b042606ab..82ebeb84a 100644 --- a/docs/algorithms/kem/ml_kem.yml +++ b/docs/algorithms/kem/ml_kem.yml @@ -17,7 +17,7 @@ website: https://pq-crystals.org/kyber/ and https://csrc.nist.gov/pubs/fips/203 nist-round: FIPS203 spec-version: ML-KEM primary-upstream: - source: https://github.com/pq-code-package/mlkem-native/commit/21c0c397f243543a9d4334860d9edb1d4e6a6cda + source: https://github.com/pq-code-package/mlkem-native/commit/68a82c658399c470624087b52c6d99032114c0b5 spdx-license-identifier: CC0-1.0 or Apache-2.0 parameter-sets: - name: ML-KEM-512 @@ -37,6 +37,34 @@ parameter-sets: no-secret-dependent-branching-claimed: true no-secret-dependent-branching-checked-by-valgrind: true large-stack-usage: false + - upstream: primary-upstream + upstream-id: x86_64 + supported-platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - bmi2 + - popcnt + common-crypto: + - SHA3: liboqs + no-secret-dependent-branching-claimed: true + no-secret-dependent-branching-checked-by-valgrind: true + large-stack-usage: false + - upstream: primary-upstream + upstream-id: aarch64 + supported-platforms: + - architecture: ARM64_V8 + operating_systems: + - Linux + - Darwin + common-crypto: + - SHA3: liboqs + no-secret-dependent-branching-claimed: true + no-secret-dependent-branching-checked-by-valgrind: false + large-stack-usage: false - name: ML-KEM-768 claimed-nist-level: 3 claimed-security: IND-CCA2 @@ -54,6 +82,34 @@ parameter-sets: no-secret-dependent-branching-claimed: true no-secret-dependent-branching-checked-by-valgrind: true large-stack-usage: false + - upstream: primary-upstream + upstream-id: x86_64 + supported-platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - bmi2 + - popcnt + common-crypto: + - SHA3: liboqs + no-secret-dependent-branching-claimed: true + 
no-secret-dependent-branching-checked-by-valgrind: true + large-stack-usage: false + - upstream: primary-upstream + upstream-id: aarch64 + supported-platforms: + - architecture: ARM64_V8 + operating_systems: + - Linux + - Darwin + common-crypto: + - SHA3: liboqs + no-secret-dependent-branching-claimed: true + no-secret-dependent-branching-checked-by-valgrind: false + large-stack-usage: false - name: ML-KEM-1024 claimed-nist-level: 5 claimed-security: IND-CCA2 @@ -71,3 +127,31 @@ parameter-sets: no-secret-dependent-branching-claimed: true no-secret-dependent-branching-checked-by-valgrind: true large-stack-usage: false + - upstream: primary-upstream + upstream-id: x86_64 + supported-platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - bmi2 + - popcnt + common-crypto: + - SHA3: liboqs + no-secret-dependent-branching-claimed: true + no-secret-dependent-branching-checked-by-valgrind: true + large-stack-usage: false + - upstream: primary-upstream + upstream-id: aarch64 + supported-platforms: + - architecture: ARM64_V8 + operating_systems: + - Linux + - Darwin + common-crypto: + - SHA3: liboqs + no-secret-dependent-branching-claimed: true + no-secret-dependent-branching-checked-by-valgrind: false + large-stack-usage: false diff --git a/docs/cbom.json b/docs/cbom.json index a9361e375..48f298f7f 100644 --- a/docs/cbom.json +++ b/docs/cbom.json @@ -2,23 +2,23 @@ "$schema": "https://raw.githubusercontent.com/CycloneDX/specification/1.6/schema/bom-1.6.schema.json", "bomFormat": "CycloneDX", "specVersion": "1.6", - "serialNumber": "urn:uuid:d66add05-17dd-4986-8894-ed47d1e910b6", + "serialNumber": "urn:uuid:11c99519-c4e5-4517-8016-4932140dd322", "version": 1, "metadata": { - "timestamp": "2024-12-09T14:24:28.343759+00:00", + "timestamp": "2025-01-22T14:42:21.903424+00:00", "component": { "type": "library", - "bom-ref": "pkg:github/open-quantum-safe/liboqs@d0d0413dc9fff538296ab86bac492cb4bf54dedb", + "bom-ref": 
"pkg:github/open-quantum-safe/liboqs@af4928dddde853579f8a16a488cf3e142f177979", "name": "liboqs", - "version": "d0d0413dc9fff538296ab86bac492cb4bf54dedb" + "version": "af4928dddde853579f8a16a488cf3e142f177979" } }, "components": [ { "type": "library", - "bom-ref": "pkg:github/open-quantum-safe/liboqs@d0d0413dc9fff538296ab86bac492cb4bf54dedb", + "bom-ref": "pkg:github/open-quantum-safe/liboqs@af4928dddde853579f8a16a488cf3e142f177979", "name": "liboqs", - "version": "d0d0413dc9fff538296ab86bac492cb4bf54dedb" + "version": "af4928dddde853579f8a16a488cf3e142f177979" }, { "type": "cryptographic-asset", @@ -1060,6 +1060,46 @@ } } }, + { + "type": "cryptographic-asset", + "bom-ref": "alg:ML-KEM-512:x86_64", + "name": "ML-KEM", + "cryptoProperties": { + "assetType": "algorithm", + "algorithmProperties": { + "parameterSetIdentifier": "ML-KEM-512", + "primitive": "kem", + "executionEnvironment": "software-plain-ram", + "cryptoFunctions": [ + "keygen", + "encapsulate", + "decapsulate" + ], + "nistQuantumSecurityLevel": 1, + "implementationPlatform": "x86_64" + } + } + }, + { + "type": "cryptographic-asset", + "bom-ref": "alg:ML-KEM-512:armv8-a", + "name": "ML-KEM", + "cryptoProperties": { + "assetType": "algorithm", + "algorithmProperties": { + "parameterSetIdentifier": "ML-KEM-512", + "primitive": "kem", + "executionEnvironment": "software-plain-ram", + "cryptoFunctions": [ + "keygen", + "encapsulate", + "decapsulate" + ], + "nistQuantumSecurityLevel": 1, + "implementationPlatform": "armv8-a" + } + } + }, { "type": "cryptographic-asset", "bom-ref": "alg:ML-KEM-768:generic", @@ -1080,6 +1120,46 @@ } } }, + { + "type": "cryptographic-asset", + "bom-ref": "alg:ML-KEM-768:x86_64", + "name": "ML-KEM", + "cryptoProperties": { + "assetType": "algorithm", + "algorithmProperties": { + "parameterSetIdentifier": "ML-KEM-768", + "primitive": "kem", + "executionEnvironment": "software-plain-ram", + "cryptoFunctions": [ + "keygen", + "encapsulate", + "decapsulate" + ], + 
"nistQuantumSecurityLevel": 3, + "implementationPlatform": "x86_64" + } + } + }, + { + "type": "cryptographic-asset", + "bom-ref": "alg:ML-KEM-768:armv8-a", + "name": "ML-KEM", + "cryptoProperties": { + "assetType": "algorithm", + "algorithmProperties": { + "parameterSetIdentifier": "ML-KEM-768", + "primitive": "kem", + "executionEnvironment": "software-plain-ram", + "cryptoFunctions": [ + "keygen", + "encapsulate", + "decapsulate" + ], + "nistQuantumSecurityLevel": 3, + "implementationPlatform": "armv8-a" + } + } + }, { "type": "cryptographic-asset", "bom-ref": "alg:ML-KEM-1024:generic", @@ -1100,6 +1180,46 @@ } } }, + { + "type": "cryptographic-asset", + "bom-ref": "alg:ML-KEM-1024:x86_64", + "name": "ML-KEM", + "cryptoProperties": { + "assetType": "algorithm", + "algorithmProperties": { + "parameterSetIdentifier": "ML-KEM-1024", + "primitive": "kem", + "executionEnvironment": "software-plain-ram", + "cryptoFunctions": [ + "keygen", + "encapsulate", + "decapsulate" + ], + "nistQuantumSecurityLevel": 5, + "implementationPlatform": "x86_64" + } + } + }, + { + "type": "cryptographic-asset", + "bom-ref": "alg:ML-KEM-1024:armv8-a", + "name": "ML-KEM", + "cryptoProperties": { + "assetType": "algorithm", + "algorithmProperties": { + "parameterSetIdentifier": "ML-KEM-1024", + "primitive": "kem", + "executionEnvironment": "software-plain-ram", + "cryptoFunctions": [ + "keygen", + "encapsulate", + "decapsulate" + ], + "nistQuantumSecurityLevel": 5, + "implementationPlatform": "armv8-a" + } + } + }, { "type": "cryptographic-asset", "bom-ref": "alg:sntrup761:generic", @@ -3067,7 +3187,7 @@ ], "dependencies": [ { - "ref": "pkg:github/open-quantum-safe/liboqs@d0d0413dc9fff538296ab86bac492cb4bf54dedb", + "ref": "pkg:github/open-quantum-safe/liboqs@af4928dddde853579f8a16a488cf3e142f177979", "provides": [ "alg:BIKE-L1:x86_64", "alg:BIKE-L3:x86_64", @@ -3121,8 +3241,14 @@ "alg:Kyber1024:x86_64", "alg:Kyber1024:armv8-a", "alg:ML-KEM-512:generic", + "alg:ML-KEM-512:x86_64", + 
"alg:ML-KEM-512:armv8-a", "alg:ML-KEM-768:generic", + "alg:ML-KEM-768:x86_64", + "alg:ML-KEM-768:armv8-a", "alg:ML-KEM-1024:generic", + "alg:ML-KEM-1024:x86_64", + "alg:ML-KEM-1024:armv8-a", "alg:sntrup761:generic", "alg:sntrup761:x86_64", "alg:cross-rsdp-128-balanced:generic", @@ -3542,18 +3668,54 @@ "alg:sha3" ] }, + { + "ref": "alg:ML-KEM-512:x86_64", + "dependsOn": [ + "alg:sha3" + ] + }, + { + "ref": "alg:ML-KEM-512:armv8-a", + "dependsOn": [ + "alg:sha3" + ] + }, { "ref": "alg:ML-KEM-768:generic", "dependsOn": [ "alg:sha3" ] }, + { + "ref": "alg:ML-KEM-768:x86_64", + "dependsOn": [ + "alg:sha3" + ] + }, + { + "ref": "alg:ML-KEM-768:armv8-a", + "dependsOn": [ + "alg:sha3" + ] + }, { "ref": "alg:ML-KEM-1024:generic", "dependsOn": [ "alg:sha3" ] }, + { + "ref": "alg:ML-KEM-1024:x86_64", + "dependsOn": [ + "alg:sha3" + ] + }, + { + "ref": "alg:ML-KEM-1024:armv8-a", + "dependsOn": [ + "alg:sha3" + ] + }, { "ref": "alg:sntrup761:generic", "dependsOn": [ diff --git a/scripts/copy_from_upstream/copy_from_upstream.py b/scripts/copy_from_upstream/copy_from_upstream.py index 400ecc57a..1959a4b72 100755 --- a/scripts/copy_from_upstream/copy_from_upstream.py +++ b/scripts/copy_from_upstream/copy_from_upstream.py @@ -495,14 +495,24 @@ def handle_implementation(impl, family, scheme, dst_basedir): else: # determine list of files to copy: if 'sources' in i: + preserve_folder_structure = ('preserve_folder_structure' in i['upstream']) and i['upstream']['preserve_folder_structure'] == True srcs = i['sources'].split(" ") for s in srcs: # Copy recursively only in case of directories not with plain files to avoid copying over symbolic links if os.path.isfile(os.path.join(origfolder, s)): - subprocess.run(['cp', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))]) + if preserve_folder_structure: + subprocess.run(['mkdir', '-p', os.path.join(srcfolder, os.path.dirname(s))]) + subprocess.run(['cp', os.path.join(origfolder, s), os.path.join(srcfolder, s)]) + 
else: + subprocess.run(['cp', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))]) + else: - subprocess.run( - ['cp', '-r', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))]) + if preserve_folder_structure: + subprocess.run( + ['cp', '-r', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.dirname(s))]) + else: + subprocess.run( + ['cp', '-r', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))]) else: subprocess.run(['cp', '-pr', os.path.join(origfolder, '.'), srcfolder]) # raise Exception("Malformed YML file: No sources listed to copy. Check upstream YML file." ) diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml index be9376220..188f9f937 100644 --- a/scripts/copy_from_upstream/copy_from_upstream.yml +++ b/scripts/copy_from_upstream/copy_from_upstream.yml @@ -33,11 +33,12 @@ upstreams: - name: mlkem-native git_url: https://github.com/pq-code-package/mlkem-native.git - git_branch: main - git_commit: 21c0c397f243543a9d4334860d9edb1d4e6a6cda - kem_meta_path: '{pretty_name_full}_META.yml' + git_branch: updates-8 + git_commit: 68a82c658399c470624087b52c6d99032114c0b5 + kem_meta_path: 'integration/liboqs/{pretty_name_full}_META.yml' kem_scheme_path: '.' 
patches: [mlkem-native.patch] + preserve_folder_structure: True - name: pqcrystals-dilithium git_url: https://github.com/pq-crystals/dilithium.git diff --git a/scripts/copy_from_upstream/patches/mlkem-native.patch b/scripts/copy_from_upstream/patches/mlkem-native.patch index 290c3f317..17317fc92 100644 --- a/scripts/copy_from_upstream/patches/mlkem-native.patch +++ b/scripts/copy_from_upstream/patches/mlkem-native.patch @@ -1,252 +1,5 @@ -diff --git a/ML-KEM-1024_META.yml b/ML-KEM-1024_META.yml -new file mode 100644 -index 00000000..62b57bdd ---- /dev/null -+++ b/ML-KEM-1024_META.yml -@@ -0,0 +1,63 @@ -+name: ML-KEM-1024 -+type: kem -+claimed-nist-level: 5 -+claimed-security: IND-CCA2 -+length-public-key: 1568 -+length-ciphertext: 1568 -+length-secret-key: 3168 -+length-shared-secret: 32 -+nistkat-sha256: f580d851e5fb27e6876e5e203fa18be4cdbfd49e05d48fec3d3992c8f43a13e6 -+testvectors-sha256: ff1a854b9b6761a70c65ccae85246fe0596a949e72eae0866a8a2a2d4ea54b10 -+principal-submitters: -+ - Peter Schwabe -+auxiliary-submitters: -+ - Roberto Avanzi -+ - Joppe Bos -+ - Léo Ducas -+ - Eike Kiltz -+ - Tancrède Lepoint -+ - Vadim Lyubashevsky -+ - John M. 
Schanck -+ - Gregor Seiler -+ - Damien Stehlé -+implementations: -+ - name: ref -+ version: FIPS203 -+ folder_name: mlkem -+ compile_opts: -DMLKEM_K=4 -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM1024_C -+ signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_C_keypair -+ signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_C_enc -+ signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_C_dec -+ sources: LICENSE arith_backend.h cbd.c cbd.h cbmc.h common.h config.h debug.c debug.h indcpa.c indcpa.h kem.c kem.h mlkem_native.h ntt.c ntt.h params.h poly.c poly.h polyvec.c polyvec.h reduce.h rej_uniform.c rej_uniform.h symmetric.h sys.h verify.c verify.h zetas.c native/api.h native/default.h -+ - name: x86_64 -+ version: FIPS203 -+ folder_name: mlkem -+ compile_opts: -DMLKEM_K=4 -DFORCE_X86_64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=X86_64_DEFAULT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_DEFAULT -+ signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_DEFAULT_keypair -+ signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_DEFAULT_enc -+ signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_DEFAULT_dec -+ sources: LICENSE arith_backend.h cbd.c cbd.h cbmc.h common.h config.h debug.c debug.h indcpa.c indcpa.h kem.c kem.h mlkem_native.h ntt.c ntt.h params.h poly.c poly.h polyvec.c polyvec.h reduce.h rej_uniform.c rej_uniform.h symmetric.h sys.h verify.c verify.h zetas.c native/api.h native/default.h native/x86_64 -+ supported_platforms: -+ - architecture: x86_64 -+ operating_systems: -+ - Linux -+ - Darwin -+ required_flags: -+ - avx2 -+ - bmi2 -+ - popcnt -+ - name: aarch64 -+ version: FIPS203 -+ folder_name: mlkem -+ compile_opts: -DMLKEM_K=4 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_OPT -+ signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_OPT_keypair -+ signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_OPT_enc -+ signature_dec: 
PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_OPT_dec -+ sources: LICENSE arith_backend.h cbd.c cbd.h cbmc.h common.h config.h debug.c debug.h indcpa.c indcpa.h kem.c kem.h mlkem_native.h ntt.c ntt.h params.h poly.c poly.h polyvec.c polyvec.h reduce.h rej_uniform.c rej_uniform.h symmetric.h sys.h verify.c verify.h zetas.c native/api.h native/default.h native/aarch64 -+ supported_platforms: -+ - architecture: arm_8 -+ operating_systems: -+ - Linux -+ - Darwin -+ required_flags: -+ - asimd -\ No newline at end of file -diff --git a/ML-KEM-512_META.yml b/ML-KEM-512_META.yml -new file mode 100644 -index 00000000..242503cf ---- /dev/null -+++ b/ML-KEM-512_META.yml -@@ -0,0 +1,63 @@ -+name: ML-KEM-512 -+type: kem -+claimed-nist-level: 1 -+claimed-security: IND-CCA2 -+length-public-key: 800 -+length-ciphertext: 768 -+length-secret-key: 1632 -+length-shared-secret: 32 -+nistkat-sha256: c70041a761e01cd6426fa60e9fd6a4412c2be817386c8d0f3334898082512782 -+testvectors-sha256: 6730bb552c22d9d2176ffb5568e48eb30952cf1f065073ec5f9724f6a3c6ea85 -+principal-submitters: -+ - Peter Schwabe -+auxiliary-submitters: -+ - Roberto Avanzi -+ - Joppe Bos -+ - Léo Ducas -+ - Eike Kiltz -+ - Tancrède Lepoint -+ - Vadim Lyubashevsky -+ - John M. 
Schanck -+ - Gregor Seiler -+ - Damien Stehlé -+implementations: -+ - name: ref -+ version: FIPS203 -+ folder_name: mlkem -+ compile_opts: -DMLKEM_K=2 -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM512_C -+ signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_C_keypair -+ signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_C_enc -+ signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_C_dec -+ sources: LICENSE arith_backend.h cbd.c cbd.h cbmc.h common.h config.h debug.c debug.h indcpa.c indcpa.h kem.c kem.h mlkem_native.h ntt.c ntt.h params.h poly.c poly.h polyvec.c polyvec.h reduce.h rej_uniform.c rej_uniform.h symmetric.h sys.h verify.c verify.h zetas.c native/api.h native/default.h -+ - name: x86_64 -+ version: FIPS203 -+ folder_name: mlkem -+ compile_opts: -DMLKEM_K=2 -DFORCE_X86_64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=X86_64_DEFAULT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM512_X86_64_DEFAULT -+ signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_DEFAULT_keypair -+ signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_DEFAULT_enc -+ signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_DEFAULT_dec -+ sources: LICENSE arith_backend.h cbd.c cbd.h cbmc.h common.h config.h debug.c debug.h indcpa.c indcpa.h kem.c kem.h mlkem_native.h ntt.c ntt.h params.h poly.c poly.h polyvec.c polyvec.h reduce.h rej_uniform.c rej_uniform.h symmetric.h sys.h verify.c verify.h zetas.c native/api.h native/default.h native/x86_64 -+ supported_platforms: -+ - architecture: x86_64 -+ operating_systems: -+ - Linux -+ - Darwin -+ required_flags: -+ - avx2 -+ - bmi2 -+ - popcnt -+ - name: aarch64 -+ version: FIPS203 -+ folder_name: mlkem -+ compile_opts: -DMLKEM_K=2 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT -+ signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT_keypair -+ signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT_enc -+ signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT_dec -+ 
sources: LICENSE arith_backend.h cbd.c cbd.h cbmc.h common.h config.h debug.c debug.h indcpa.c indcpa.h kem.c kem.h mlkem_native.h ntt.c ntt.h params.h poly.c poly.h polyvec.c polyvec.h reduce.h rej_uniform.c rej_uniform.h symmetric.h sys.h verify.c verify.h zetas.c native/api.h native/default.h native/aarch64 -+ supported_platforms: -+ - architecture: arm_8 -+ operating_systems: -+ - Linux -+ - Darwin -+ required_flags: -+ - asimd -\ No newline at end of file -diff --git a/ML-KEM-768_META.yml b/ML-KEM-768_META.yml -new file mode 100644 -index 00000000..74e23d9a ---- /dev/null -+++ b/ML-KEM-768_META.yml -@@ -0,0 +1,63 @@ -+name: ML-KEM-768 -+type: kem -+claimed-nist-level: 3 -+claimed-security: IND-CCA2 -+length-public-key: 1184 -+length-ciphertext: 1088 -+length-secret-key: 2400 -+length-shared-secret: 32 -+nistkat-sha256: 5352539586b6c3df58be6158a6250aeff402bd73060b0a3de68850ac074c17c3 -+testvectors-sha256: 667c8ca2ca93729c0df6ff24588460bad1bbdbfb64ece0fe8563852a7ff348c6 -+principal-submitters: -+ - Peter Schwabe -+auxiliary-submitters: -+ - Roberto Avanzi -+ - Joppe Bos -+ - Léo Ducas -+ - Eike Kiltz -+ - Tancrède Lepoint -+ - Vadim Lyubashevsky -+ - John M. 
Schanck -+ - Gregor Seiler -+ - Damien Stehlé -+implementations: -+ - name: ref -+ version: FIPS203 -+ folder_name: mlkem -+ compile_opts: -DMLKEM_K=3 -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM768_C -+ signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_C_keypair -+ signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_C_enc -+ signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_C_dec -+ sources: LICENSE arith_backend.h cbd.c cbd.h cbmc.h common.h config.h debug.c debug.h indcpa.c indcpa.h kem.c kem.h mlkem_native.h ntt.c ntt.h params.h poly.c poly.h polyvec.c polyvec.h reduce.h rej_uniform.c rej_uniform.h symmetric.h sys.h verify.c verify.h zetas.c native/api.h native/default.h -+ - name: x86_64 -+ version: FIPS203 -+ folder_name: mlkem -+ compile_opts: -DMLKEM_K=3 -DFORCE_X86_64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=X86_64_DEFAULT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM768_X86_64_DEFAULT -+ signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_DEFAULT_keypair -+ signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_DEFAULT_enc -+ signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_DEFAULT_dec -+ sources: LICENSE arith_backend.h cbd.c cbd.h cbmc.h common.h config.h debug.c debug.h indcpa.c indcpa.h kem.c kem.h mlkem_native.h ntt.c ntt.h params.h poly.c poly.h polyvec.c polyvec.h reduce.h rej_uniform.c rej_uniform.h symmetric.h sys.h verify.c verify.h zetas.c native/api.h native/default.h native/x86_64 -+ supported_platforms: -+ - architecture: x86_64 -+ operating_systems: -+ - Linux -+ - Darwin -+ required_flags: -+ - avx2 -+ - bmi2 -+ - popcnt -+ - name: aarch64 -+ version: FIPS203 -+ folder_name: mlkem -+ compile_opts: -DMLKEM_K=3 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_OPT -+ signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_OPT_keypair -+ signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_OPT_enc -+ signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_OPT_dec -+ 
sources: LICENSE arith_backend.h cbd.c cbd.h cbmc.h common.h config.h debug.c debug.h indcpa.c indcpa.h kem.c kem.h mlkem_native.h ntt.c ntt.h params.h poly.c poly.h polyvec.c polyvec.h reduce.h rej_uniform.c rej_uniform.h symmetric.h sys.h verify.c verify.h zetas.c native/api.h native/default.h native/aarch64 -+ supported_platforms: -+ - architecture: arm_8 -+ operating_systems: -+ - Linux -+ - Darwin -+ required_flags: -+ - asimd -\ No newline at end of file -diff --git a/mlkem/arith_backend.h b/mlkem/arith_backend.h -index ade31cda..0543b1bd 100644 ---- a/mlkem/arith_backend.h -+++ b/mlkem/arith_backend.h -@@ -17,7 +17,7 @@ - * Keep this _after_ the inclusion of the backend; otherwise, - * the sanity checks won't have an effect. */ - #if defined(MLKEM_NATIVE_CHECK_APIS) --#include "native/api.h" -+#include "api.h" - #endif - #endif - -diff --git a/mlkem/config.h b/mlkem/config.h -index 24a49709..fa89370c 100644 ---- a/mlkem/config.h -+++ b/mlkem/config.h -@@ -146,7 +146,7 @@ - * - *****************************************************************************/ - #if defined(MLKEM_USE_NATIVE) && !defined(MLKEM_NATIVE_ARITH_BACKEND) --#define MLKEM_NATIVE_ARITH_BACKEND "native/default.h" -+#define MLKEM_NATIVE_ARITH_BACKEND "default.h" - #endif /* MLKEM_NATIVE_ARITH_BACKEND */ - - /****************************************************************************** -@@ -159,8 +159,8 @@ - * This can be set using CFLAGS. 
- * - *****************************************************************************/ --#if defined(MLKEM_USE_NATIVE) && !defined(MLKEM_NATIVE_FIPS202_BACKEND) --#define MLKEM_NATIVE_FIPS202_BACKEND "fips202/native/default.h" -+#if defined(MLKEM_USE_NATIVE_FIPS202) && !defined(MLKEM_NATIVE_FIPS202_BACKEND) -+#define MLKEM_NATIVE_FIPS202_BACKEND "native/default.h" - #endif /* MLKEM_NATIVE_FIPS202_BACKEND */ - - /************************* Config internals ********************************/ diff --git a/mlkem/indcpa.c b/mlkem/indcpa.c -index 390cc6f2..0cfcc3e9 100644 +index fdca7caf..318d0fc7 100644 --- a/mlkem/indcpa.c +++ b/mlkem/indcpa.c @@ -6,8 +6,8 @@ @@ -258,458 +11,25 @@ index 390cc6f2..0cfcc3e9 100644 +#include "fips202.h" +#include "fips202x4.h" #include "indcpa.h" - #include "ntt.h" #include "poly.h" -diff --git a/mlkem/native/aarch64/clean.h b/mlkem/native/aarch64/clean.h -index f124702a..43a401df 100644 ---- a/mlkem/native/aarch64/clean.h -+++ b/mlkem/native/aarch64/clean.h -@@ -19,6 +19,6 @@ - /* Filename of the C backend implementation. - * This is not inlined here because this header is included in assembly - * files as well. */ --#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/aarch64/src/clean_impl.h" -+#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "aarch64/src/clean_impl.h" - - #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */ -diff --git a/mlkem/native/aarch64/opt.h b/mlkem/native/aarch64/opt.h -index a7217163..04323c3e 100644 ---- a/mlkem/native/aarch64/opt.h -+++ b/mlkem/native/aarch64/opt.h -@@ -19,6 +19,6 @@ - /* Filename of the C backend implementation. - * This is not inlined here because this header is included in assembly - * files as well. 
*/ --#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/aarch64/src/opt_impl.h" -+#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "aarch64/src/opt_impl.h" - - #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */ -diff --git a/mlkem/native/aarch64/src/aarch64_zetas.c b/mlkem/native/aarch64/src/aarch64_zetas.c -index b3a6f198..1e189fd9 100644 ---- a/mlkem/native/aarch64/src/aarch64_zetas.c -+++ b/mlkem/native/aarch64/src/aarch64_zetas.c -@@ -8,7 +8,7 @@ - * Do not modify it directly. - */ - --#include "../../../common.h" -+#include "common.h" - - #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ - defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) -diff --git a/mlkem/native/aarch64/src/arith_native_aarch64.h b/mlkem/native/aarch64/src/arith_native_aarch64.h -index a784a302..fc4e7dd3 100644 ---- a/mlkem/native/aarch64/src/arith_native_aarch64.h -+++ b/mlkem/native/aarch64/src/arith_native_aarch64.h -@@ -6,7 +6,7 @@ - #define MLKEM_AARCH64_NATIVE_H - - #include --#include "../../../common.h" -+#include "common.h" - - #define aarch64_ntt_zetas_layer01234 \ - MLKEM_NAMESPACE(aarch64_ntt_zetas_layer01234) -diff --git a/mlkem/native/aarch64/src/clean_impl.h b/mlkem/native/aarch64/src/clean_impl.h -index 805adef1..548b1eeb 100644 ---- a/mlkem/native/aarch64/src/clean_impl.h -+++ b/mlkem/native/aarch64/src/clean_impl.h -@@ -12,8 +12,8 @@ - - #include "arith_native_aarch64.h" - --#include "../../../poly.h" --#include "../../../polyvec.h" -+#include "poly.h" -+#include "polyvec.h" - - /* Set of primitives that this backend replaces */ - #define MLKEM_USE_NATIVE_NTT -diff --git a/mlkem/native/aarch64/src/consts.h b/mlkem/native/aarch64/src/consts.h -index e3ea26a2..c4094729 100644 ---- a/mlkem/native/aarch64/src/consts.h -+++ b/mlkem/native/aarch64/src/consts.h -@@ -7,7 +7,7 @@ - #define MLKEM_NATIVE_AARCH64_CONSTS - - #include --#include "../../../common.h" -+#include "common.h" - - #define zetas_mulcache_native MLKEM_NAMESPACE(zetas_mulcache_native) - extern const int16_t 
zetas_mulcache_native[256]; -diff --git a/mlkem/native/aarch64/src/intt_clean.S b/mlkem/native/aarch64/src/intt_clean.S -index 28ad3897..b243a569 100644 ---- a/mlkem/native/aarch64/src/intt_clean.S -+++ b/mlkem/native/aarch64/src/intt_clean.S -@@ -23,7 +23,7 @@ - /// SOFTWARE. - /// - --#include "../../../common.h" -+#include "common.h" - #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) - - // Bounds: -diff --git a/mlkem/native/aarch64/src/intt_opt.S b/mlkem/native/aarch64/src/intt_opt.S -index 857c729c..c94746e1 100644 ---- a/mlkem/native/aarch64/src/intt_opt.S -+++ b/mlkem/native/aarch64/src/intt_opt.S -@@ -23,7 +23,7 @@ - /// SOFTWARE. - /// - --#include "../../../common.h" -+#include "common.h" - #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) - - // Bounds: -diff --git a/mlkem/native/aarch64/src/ntt_clean.S b/mlkem/native/aarch64/src/ntt_clean.S -index 30fdc76b..cd63cc4d 100644 ---- a/mlkem/native/aarch64/src/ntt_clean.S -+++ b/mlkem/native/aarch64/src/ntt_clean.S -@@ -24,7 +24,7 @@ - /// SOFTWARE. - /// - --#include "../../../common.h" -+#include "common.h" - #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) - - // Bounds: -diff --git a/mlkem/native/aarch64/src/ntt_opt.S b/mlkem/native/aarch64/src/ntt_opt.S -index 431f9dc6..8705615b 100644 ---- a/mlkem/native/aarch64/src/ntt_opt.S -+++ b/mlkem/native/aarch64/src/ntt_opt.S -@@ -24,7 +24,7 @@ - /// SOFTWARE. 
- /// - --#include "../../../common.h" -+#include "common.h" - #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) - - // Bounds: -diff --git a/mlkem/native/aarch64/src/opt_impl.h b/mlkem/native/aarch64/src/opt_impl.h -index b92f3adf..ec1bf658 100644 ---- a/mlkem/native/aarch64/src/opt_impl.h -+++ b/mlkem/native/aarch64/src/opt_impl.h -@@ -12,8 +12,8 @@ - - #include "arith_native_aarch64.h" - --#include "../../../poly.h" --#include "../../../polyvec.h" -+#include "poly.h" -+#include "polyvec.h" - - /* Set of primitives that this backend replaces */ - #define MLKEM_USE_NATIVE_NTT -diff --git a/mlkem/native/aarch64/src/poly_clean.S b/mlkem/native/aarch64/src/poly_clean.S -index f3ee0796..809f9667 100644 ---- a/mlkem/native/aarch64/src/poly_clean.S -+++ b/mlkem/native/aarch64/src/poly_clean.S -@@ -3,7 +3,7 @@ - * SPDX-License-Identifier: Apache-2.0 - */ - --#include "../../../common.h" -+#include "common.h" - #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) - - /* -diff --git a/mlkem/native/aarch64/src/poly_opt.S b/mlkem/native/aarch64/src/poly_opt.S -index 555c60a6..815a9dd1 100644 ---- a/mlkem/native/aarch64/src/poly_opt.S -+++ b/mlkem/native/aarch64/src/poly_opt.S -@@ -3,7 +3,7 @@ - * SPDX-License-Identifier: Apache-2.0 - */ - --#include "../../../common.h" -+#include "common.h" - #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) - - /* -diff --git a/mlkem/native/aarch64/src/polyvec_clean.S b/mlkem/native/aarch64/src/polyvec_clean.S -index 0b6df634..c91675b4 100644 ---- a/mlkem/native/aarch64/src/polyvec_clean.S -+++ b/mlkem/native/aarch64/src/polyvec_clean.S -@@ -9,7 +9,7 @@ - // https://eprint.iacr.org/2021/986 - // https://github.com/neon-ntt/neon-ntt - --#include "../../../common.h" -+#include "common.h" - #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) - - // Input: -diff --git a/mlkem/native/aarch64/src/polyvec_opt.S b/mlkem/native/aarch64/src/polyvec_opt.S -index 7a27fda3..8300b682 100644 ---- a/mlkem/native/aarch64/src/polyvec_opt.S -+++ 
b/mlkem/native/aarch64/src/polyvec_opt.S -@@ -9,7 +9,7 @@ - // https://eprint.iacr.org/2021/986 - // https://github.com/neon-ntt/neon-ntt - --#include "../../../common.h" -+#include "common.h" - #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) - - // Input: -diff --git a/mlkem/native/aarch64/src/rej_uniform_asm_clean.S b/mlkem/native/aarch64/src/rej_uniform_asm_clean.S -index 9158d6c8..5151a05d 100644 ---- a/mlkem/native/aarch64/src/rej_uniform_asm_clean.S -+++ b/mlkem/native/aarch64/src/rej_uniform_asm_clean.S -@@ -18,7 +18,7 @@ - * - * Returns number of sampled 16-bit integers (at most MLKEM_N). - **************************************************/ --#include "../../../common.h" -+#include "common.h" - #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ - defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) - -diff --git a/mlkem/native/aarch64/src/rej_uniform_table.c b/mlkem/native/aarch64/src/rej_uniform_table.c -index 29cdbe95..50766034 100644 ---- a/mlkem/native/aarch64/src/rej_uniform_table.c -+++ b/mlkem/native/aarch64/src/rej_uniform_table.c -@@ -8,7 +8,7 @@ - * Do not modify it directly. 
- */ - --#include "../../../common.h" -+#include "common.h" - - #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ - defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) -diff --git a/mlkem/native/api.h b/mlkem/native/api.h -index 5732b97c..792ecb8a 100644 ---- a/mlkem/native/api.h -+++ b/mlkem/native/api.h -@@ -23,8 +23,8 @@ - #define MLKEM_NATIVE_ARITH_NATIVE_API_H - - #include --#include "../poly.h" --#include "../polyvec.h" -+#include "poly.h" -+#include "polyvec.h" - - /* - * This is the C<->native interface allowing for the drop-in of -diff --git a/mlkem/native/default.h b/mlkem/native/default.h -index f9fe4310..d1e41c52 100644 ---- a/mlkem/native/default.h -+++ b/mlkem/native/default.h -@@ -8,7 +8,7 @@ - /* - * Default arithmetic backend - */ --#include "../sys.h" -+#include "sys.h" - - #ifdef SYS_AARCH64 - /* -diff --git a/mlkem/native/x86_64/default.h b/mlkem/native/x86_64/default.h -index 73f53dc1..592e8996 100644 ---- a/mlkem/native/x86_64/default.h -+++ b/mlkem/native/x86_64/default.h -@@ -19,6 +19,6 @@ - /* Filename of the C backend implementation. - * This is not inlined here because this header is included in assembly - * files as well. 
*/ --#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/x86_64/src/default_impl.h" -+#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "x86_64/src/default_impl.h" - - #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */ -diff --git a/mlkem/native/x86_64/src/arith_native_x86_64.h b/mlkem/native/x86_64/src/arith_native_x86_64.h -index acf3ae56..25e00a93 100644 ---- a/mlkem/native/x86_64/src/arith_native_x86_64.h -+++ b/mlkem/native/x86_64/src/arith_native_x86_64.h -@@ -5,11 +5,11 @@ - #ifndef MLKEM_X86_64_NATIVE_H - #define MLKEM_X86_64_NATIVE_H - --#include "../../../common.h" -+#include "common.h" - - #include - #include --#include "../../../polyvec.h" -+#include "polyvec.h" - #include "consts.h" - - #define REJ_UNIFORM_AVX_NBLOCKS 3 /* See MLKEM_GEN_MATRIX_NBLOCKS */ -diff --git a/mlkem/native/x86_64/src/basemul.S b/mlkem/native/x86_64/src/basemul.S -index 5fdc3d0a..b97840e7 100644 ---- a/mlkem/native/x86_64/src/basemul.S -+++ b/mlkem/native/x86_64/src/basemul.S -@@ -6,7 +6,7 @@ - // Implementation from Kyber reference repository - // https://github.com/pq-crystals/kyber/blob/main/avx2 - --#include "../../../common.h" -+#include "common.h" - #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) - - #include "consts.h" -diff --git a/mlkem/native/x86_64/src/basemul.c b/mlkem/native/x86_64/src/basemul.c -index 8a23ddcc..5f9ae99c 100644 ---- a/mlkem/native/x86_64/src/basemul.c -+++ b/mlkem/native/x86_64/src/basemul.c -@@ -3,12 +3,12 @@ - * SPDX-License-Identifier: Apache-2.0 - */ - --#include "../../../common.h" -+#include "common.h" - - #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) - --#include "../../../poly.h" --#include "../../../polyvec.h" -+#include "poly.h" -+#include "polyvec.h" - - #include "arith_native_x86_64.h" - #include "consts.h" -diff --git a/mlkem/native/x86_64/src/consts.c b/mlkem/native/x86_64/src/consts.c -index 568752ae..86a0835e 100644 ---- a/mlkem/native/x86_64/src/consts.c -+++ b/mlkem/native/x86_64/src/consts.c -@@ -8,7 +8,7 @@ - * 
https://github.com/pq-crystals/kyber/blob/main/avx2/consts.c - */ - --#include "../../../common.h" -+#include "common.h" - - #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) - -diff --git a/mlkem/native/x86_64/src/consts.h b/mlkem/native/x86_64/src/consts.h -index e2846b60..00c41595 100644 ---- a/mlkem/native/x86_64/src/consts.h -+++ b/mlkem/native/x86_64/src/consts.h -@@ -11,7 +11,7 @@ - #ifndef CONSTS_H - #define CONSTS_H - --#include "../../../common.h" -+#include "common.h" - - #define AVX2_BACKEND_DATA_OFFSET_16XQ 0 - #define AVX2_BACKEND_DATA_OFFSET_16XQINV 16 -diff --git a/mlkem/native/x86_64/src/default_impl.h b/mlkem/native/x86_64/src/default_impl.h -index cdbd44da..029111c1 100644 ---- a/mlkem/native/x86_64/src/default_impl.h -+++ b/mlkem/native/x86_64/src/default_impl.h -@@ -12,8 +12,8 @@ - - #include - --#include "../../../poly.h" --#include "../../../polyvec.h" -+#include "poly.h" -+#include "polyvec.h" - #include "arith_native_x86_64.h" - - #define MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER -diff --git a/mlkem/native/x86_64/src/fq.S b/mlkem/native/x86_64/src/fq.S -index 3f013a5f..134bd4f7 100644 ---- a/mlkem/native/x86_64/src/fq.S -+++ b/mlkem/native/x86_64/src/fq.S -@@ -11,7 +11,7 @@ - // in [0,1,...,q-1] rather than [0,1,...,q], matching the - // semantics of poly_reduce(). 
- --#include "../../../common.h" -+#include "common.h" - - #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) - #include "consts.h" -diff --git a/mlkem/native/x86_64/src/intt.S b/mlkem/native/x86_64/src/intt.S -index 7b1f2262..6b1d78ef 100644 ---- a/mlkem/native/x86_64/src/intt.S -+++ b/mlkem/native/x86_64/src/intt.S -@@ -9,7 +9,7 @@ - * Changes to placement of modular reductions have - * been made to simplify reasoning of non-overflow */ - --#include "../../../common.h" -+#include "common.h" - - #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) - -diff --git a/mlkem/native/x86_64/src/ntt.S b/mlkem/native/x86_64/src/ntt.S -index 5d928b4c..e8bf7894 100644 ---- a/mlkem/native/x86_64/src/ntt.S -+++ b/mlkem/native/x86_64/src/ntt.S -@@ -6,7 +6,7 @@ - // Implementation from Kyber reference repository - // https://github.com/pq-crystals/kyber/blob/main/avx2 - --#include "../../../common.h" -+#include "common.h" - #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) - - #include "consts.h" -diff --git a/mlkem/native/x86_64/src/rej_uniform_avx2.c b/mlkem/native/x86_64/src/rej_uniform_avx2.c -index adf2d338..54037a0d 100644 ---- a/mlkem/native/x86_64/src/rej_uniform_avx2.c -+++ b/mlkem/native/x86_64/src/rej_uniform_avx2.c -@@ -8,7 +8,7 @@ - * https://github.com/pq-crystals/kyber/blob/main/avx2 - */ - --#include "../../../common.h" -+#include "common.h" - - #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) - -diff --git a/mlkem/native/x86_64/src/rej_uniform_table.c b/mlkem/native/x86_64/src/rej_uniform_table.c -index e95fd9e7..9bbc4714 100644 ---- a/mlkem/native/x86_64/src/rej_uniform_table.c -+++ b/mlkem/native/x86_64/src/rej_uniform_table.c -@@ -8,7 +8,7 @@ - * Do not modify it directly. 
- */ - --#include "../../../common.h" -+#include "common.h" - - #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) - -diff --git a/mlkem/native/x86_64/src/shuffle.S b/mlkem/native/x86_64/src/shuffle.S -index 9bcd0489..5e708748 100644 ---- a/mlkem/native/x86_64/src/shuffle.S -+++ b/mlkem/native/x86_64/src/shuffle.S -@@ -6,7 +6,7 @@ - // Implementation from Kyber reference repository - // https://github.com/pq-crystals/kyber/blob/main/avx2 - --#include "../../../common.h" -+#include "common.h" - - #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) - + #include "poly_k.h" diff --git a/mlkem/poly.c b/mlkem/poly.c -index 26b358a2..7483ebf6 100644 +index 2bf10fd0..e8a2e2c6 100644 --- a/mlkem/poly.c +++ b/mlkem/poly.c -@@ -11,7 +11,7 @@ - #include "cbd.h" +@@ -10,7 +10,7 @@ + #include "arith_backend.h" #include "cbmc.h" #include "debug.h" -#include "fips202/fips202x4.h" +#include "fips202x4.h" - #include "ntt.h" #include "poly.h" - #include "reduce.h" -diff --git a/mlkem/rej_uniform.c b/mlkem/rej_uniform.c -index 626a440e..cbbe4407 100644 ---- a/mlkem/rej_uniform.c -+++ b/mlkem/rej_uniform.c + #include "sampling.h" + #include "symmetric.h" +diff --git a/mlkem/sampling.c b/mlkem/sampling.c +index 3402ab25..98cbdcb7 100644 +--- a/mlkem/sampling.c ++++ b/mlkem/sampling.c @@ -7,8 +7,8 @@ #include "arith_backend.h" @@ -718,10 +38,10 @@ index 626a440e..cbbe4407 100644 -#include "fips202/fips202x4.h" +#include "fips202.h" +#include "fips202x4.h" - #include "rej_uniform.h" + #include "sampling.h" #include "symmetric.h" -@@ -155,6 +155,8 @@ void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4]) +@@ -157,6 +157,8 @@ void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4]) xof_x4_ctx statex; unsigned int buflen; @@ -730,7 +50,7 @@ index 626a440e..cbbe4407 100644 /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */ xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3], MLKEM_SYMBYTES + 2); -@@ -205,6 +207,8 @@ void poly_rej_uniform(poly *entry, 
uint8_t seed[MLKEM_SYMBYTES + 2]) +@@ -207,6 +209,8 @@ void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; unsigned int ctr, buflen; diff --git a/src/kem/ml_kem/CMakeLists.txt b/src/kem/ml_kem/CMakeLists.txt index fc2655ddf..102e8993c 100644 --- a/src/kem/ml_kem/CMakeLists.txt +++ b/src/kem/ml_kem/CMakeLists.txt @@ -6,7 +6,7 @@ set(_ML_KEM_OBJS "") if(OQS_ENABLE_KEM_ml_kem_512) - add_library(ml_kem_512_ref OBJECT kem_ml_kem_512.c mlkem-native_ml-kem-512_ref/cbd.c mlkem-native_ml-kem-512_ref/debug.c mlkem-native_ml-kem-512_ref/indcpa.c mlkem-native_ml-kem-512_ref/kem.c mlkem-native_ml-kem-512_ref/ntt.c mlkem-native_ml-kem-512_ref/poly.c mlkem-native_ml-kem-512_ref/polyvec.c mlkem-native_ml-kem-512_ref/rej_uniform.c mlkem-native_ml-kem-512_ref/verify.c mlkem-native_ml-kem-512_ref/zetas.c) + add_library(ml_kem_512_ref OBJECT kem_ml_kem_512.c mlkem-native_ml-kem-512_ref/compress.c mlkem-native_ml-kem-512_ref/debug.c mlkem-native_ml-kem-512_ref/indcpa.c mlkem-native_ml-kem-512_ref/kem.c mlkem-native_ml-kem-512_ref/poly.c mlkem-native_ml-kem-512_ref/poly_k.c mlkem-native_ml-kem-512_ref/sampling.c mlkem-native_ml-kem-512_ref/verify.c mlkem-native_ml-kem-512_ref/zetas.c) target_compile_options(ml_kem_512_ref PUBLIC -DMLKEM_K=2 -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM512_C) target_include_directories(ml_kem_512_ref PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-512_ref) target_include_directories(ml_kem_512_ref PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) @@ -15,24 +15,24 @@ if(OQS_ENABLE_KEM_ml_kem_512) endif() if(OQS_ENABLE_KEM_ml_kem_512_x86_64) - add_library(ml_kem_512_x86_64 OBJECT mlkem-native_ml-kem-512_x86_64/cbd.c mlkem-native_ml-kem-512_x86_64/debug.c mlkem-native_ml-kem-512_x86_64/indcpa.c mlkem-native_ml-kem-512_x86_64/kem.c mlkem-native_ml-kem-512_x86_64/ntt.c mlkem-native_ml-kem-512_x86_64/poly.c mlkem-native_ml-kem-512_x86_64/polyvec.c 
mlkem-native_ml-kem-512_x86_64/rej_uniform.c mlkem-native_ml-kem-512_x86_64/verify.c mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.c mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.S mlkem-native_ml-kem-512_x86_64/x86_64/src/consts.c mlkem-native_ml-kem-512_x86_64/x86_64/src/fq.S mlkem-native_ml-kem-512_x86_64/x86_64/src/intt.S mlkem-native_ml-kem-512_x86_64/x86_64/src/ntt.S mlkem-native_ml-kem-512_x86_64/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-512_x86_64/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-512_x86_64/x86_64/src/shuffle.S mlkem-native_ml-kem-512_x86_64/zetas.c) + add_library(ml_kem_512_x86_64 OBJECT mlkem-native_ml-kem-512_x86_64/compress.c mlkem-native_ml-kem-512_x86_64/debug.c mlkem-native_ml-kem-512_x86_64/indcpa.c mlkem-native_ml-kem-512_x86_64/kem.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/consts.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/fq.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/intt.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntt.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/shuffle.S mlkem-native_ml-kem-512_x86_64/poly.c mlkem-native_ml-kem-512_x86_64/poly_k.c mlkem-native_ml-kem-512_x86_64/sampling.c mlkem-native_ml-kem-512_x86_64/verify.c mlkem-native_ml-kem-512_x86_64/zetas.c) target_include_directories(ml_kem_512_x86_64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-512_x86_64) target_include_directories(ml_kem_512_x86_64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) target_compile_options(ml_kem_512_x86_64 PRIVATE -mavx2 -mbmi2 -mpopcnt ) - target_compile_options(ml_kem_512_x86_64 PUBLIC -DMLKEM_K=2 -DFORCE_X86_64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=X86_64_DEFAULT -DMLKEM_USE_NATIVE 
-DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM512_X86_64_DEFAULT) + target_compile_options(ml_kem_512_x86_64 PUBLIC -DMLKEM_K=2 -DFORCE_X86_64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=X86_64_DEFAULT -DMLKEM_USE_NATIVE_BACKEND_ARITH -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM512_X86_64_DEFAULT) set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) endif() if(OQS_ENABLE_KEM_ml_kem_512_aarch64) - add_library(ml_kem_512_aarch64 OBJECT mlkem-native_ml-kem-512_aarch64/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-512_aarch64/cbd.c mlkem-native_ml-kem-512_aarch64/debug.c mlkem-native_ml-kem-512_aarch64/indcpa.c mlkem-native_ml-kem-512_aarch64/kem.c mlkem-native_ml-kem-512_aarch64/ntt.c mlkem-native_ml-kem-512_aarch64/poly.c mlkem-native_ml-kem-512_aarch64/polyvec.c mlkem-native_ml-kem-512_aarch64/rej_uniform.c mlkem-native_ml-kem-512_aarch64/verify.c mlkem-native_ml-kem-512_aarch64/zetas.c) + add_library(ml_kem_512_aarch64 OBJECT mlkem-native_ml-kem-512_aarch64/compress.c mlkem-native_ml-kem-512_aarch64/debug.c mlkem-native_ml-kem-512_aarch64/indcpa.c mlkem-native_ml-kem-512_aarch64/kem.c mlkem-native_ml-kem-512_aarch64/native/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_clean.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_clean.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_opt.S 
mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_clean.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-512_aarch64/poly.c mlkem-native_ml-kem-512_aarch64/poly_k.c mlkem-native_ml-kem-512_aarch64/sampling.c mlkem-native_ml-kem-512_aarch64/verify.c mlkem-native_ml-kem-512_aarch64/zetas.c) target_include_directories(ml_kem_512_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-512_aarch64) target_include_directories(ml_kem_512_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) - target_compile_options(ml_kem_512_aarch64 PUBLIC -DMLKEM_K=2 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT) + target_compile_options(ml_kem_512_aarch64 PUBLIC -DMLKEM_K=2 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE_BACKEND_ARITH -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT) set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) endif() if(OQS_ENABLE_KEM_ml_kem_768) - add_library(ml_kem_768_ref OBJECT kem_ml_kem_768.c mlkem-native_ml-kem-768_ref/cbd.c mlkem-native_ml-kem-768_ref/debug.c mlkem-native_ml-kem-768_ref/indcpa.c mlkem-native_ml-kem-768_ref/kem.c mlkem-native_ml-kem-768_ref/ntt.c mlkem-native_ml-kem-768_ref/poly.c mlkem-native_ml-kem-768_ref/polyvec.c mlkem-native_ml-kem-768_ref/rej_uniform.c mlkem-native_ml-kem-768_ref/verify.c mlkem-native_ml-kem-768_ref/zetas.c) + add_library(ml_kem_768_ref OBJECT kem_ml_kem_768.c mlkem-native_ml-kem-768_ref/compress.c mlkem-native_ml-kem-768_ref/debug.c mlkem-native_ml-kem-768_ref/indcpa.c mlkem-native_ml-kem-768_ref/kem.c mlkem-native_ml-kem-768_ref/poly.c 
mlkem-native_ml-kem-768_ref/poly_k.c mlkem-native_ml-kem-768_ref/sampling.c mlkem-native_ml-kem-768_ref/verify.c mlkem-native_ml-kem-768_ref/zetas.c) target_compile_options(ml_kem_768_ref PUBLIC -DMLKEM_K=3 -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM768_C) target_include_directories(ml_kem_768_ref PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-768_ref) target_include_directories(ml_kem_768_ref PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) @@ -41,24 +41,24 @@ if(OQS_ENABLE_KEM_ml_kem_768) endif() if(OQS_ENABLE_KEM_ml_kem_768_x86_64) - add_library(ml_kem_768_x86_64 OBJECT mlkem-native_ml-kem-768_x86_64/cbd.c mlkem-native_ml-kem-768_x86_64/debug.c mlkem-native_ml-kem-768_x86_64/indcpa.c mlkem-native_ml-kem-768_x86_64/kem.c mlkem-native_ml-kem-768_x86_64/ntt.c mlkem-native_ml-kem-768_x86_64/poly.c mlkem-native_ml-kem-768_x86_64/polyvec.c mlkem-native_ml-kem-768_x86_64/rej_uniform.c mlkem-native_ml-kem-768_x86_64/verify.c mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.c mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.S mlkem-native_ml-kem-768_x86_64/x86_64/src/consts.c mlkem-native_ml-kem-768_x86_64/x86_64/src/fq.S mlkem-native_ml-kem-768_x86_64/x86_64/src/intt.S mlkem-native_ml-kem-768_x86_64/x86_64/src/ntt.S mlkem-native_ml-kem-768_x86_64/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-768_x86_64/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-768_x86_64/x86_64/src/shuffle.S mlkem-native_ml-kem-768_x86_64/zetas.c) + add_library(ml_kem_768_x86_64 OBJECT mlkem-native_ml-kem-768_x86_64/compress.c mlkem-native_ml-kem-768_x86_64/debug.c mlkem-native_ml-kem-768_x86_64/indcpa.c mlkem-native_ml-kem-768_x86_64/kem.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/consts.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/fq.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/intt.S 
mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntt.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/shuffle.S mlkem-native_ml-kem-768_x86_64/poly.c mlkem-native_ml-kem-768_x86_64/poly_k.c mlkem-native_ml-kem-768_x86_64/sampling.c mlkem-native_ml-kem-768_x86_64/verify.c mlkem-native_ml-kem-768_x86_64/zetas.c) target_include_directories(ml_kem_768_x86_64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-768_x86_64) target_include_directories(ml_kem_768_x86_64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) target_compile_options(ml_kem_768_x86_64 PRIVATE -mavx2 -mbmi2 -mpopcnt ) - target_compile_options(ml_kem_768_x86_64 PUBLIC -DMLKEM_K=3 -DFORCE_X86_64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=X86_64_DEFAULT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM768_X86_64_DEFAULT) + target_compile_options(ml_kem_768_x86_64 PUBLIC -DMLKEM_K=3 -DFORCE_X86_64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=X86_64_DEFAULT -DMLKEM_USE_NATIVE_BACKEND_ARITH -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM768_X86_64_DEFAULT) set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) endif() if(OQS_ENABLE_KEM_ml_kem_768_aarch64) - add_library(ml_kem_768_aarch64 OBJECT mlkem-native_ml-kem-768_aarch64/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-768_aarch64/cbd.c 
mlkem-native_ml-kem-768_aarch64/debug.c mlkem-native_ml-kem-768_aarch64/indcpa.c mlkem-native_ml-kem-768_aarch64/kem.c mlkem-native_ml-kem-768_aarch64/ntt.c mlkem-native_ml-kem-768_aarch64/poly.c mlkem-native_ml-kem-768_aarch64/polyvec.c mlkem-native_ml-kem-768_aarch64/rej_uniform.c mlkem-native_ml-kem-768_aarch64/verify.c mlkem-native_ml-kem-768_aarch64/zetas.c) + add_library(ml_kem_768_aarch64 OBJECT mlkem-native_ml-kem-768_aarch64/compress.c mlkem-native_ml-kem-768_aarch64/debug.c mlkem-native_ml-kem-768_aarch64/indcpa.c mlkem-native_ml-kem-768_aarch64/kem.c mlkem-native_ml-kem-768_aarch64/native/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_clean.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_clean.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_clean.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-768_aarch64/poly.c mlkem-native_ml-kem-768_aarch64/poly_k.c mlkem-native_ml-kem-768_aarch64/sampling.c mlkem-native_ml-kem-768_aarch64/verify.c mlkem-native_ml-kem-768_aarch64/zetas.c) target_include_directories(ml_kem_768_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-768_aarch64) target_include_directories(ml_kem_768_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) - target_compile_options(ml_kem_768_aarch64 PUBLIC -DMLKEM_K=3 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_OPT) + target_compile_options(ml_kem_768_aarch64 PUBLIC -DMLKEM_K=3 -DFORCE_AARCH64 
-DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE_BACKEND_ARITH -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_OPT) set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) endif() if(OQS_ENABLE_KEM_ml_kem_1024) - add_library(ml_kem_1024_ref OBJECT kem_ml_kem_1024.c mlkem-native_ml-kem-1024_ref/cbd.c mlkem-native_ml-kem-1024_ref/debug.c mlkem-native_ml-kem-1024_ref/indcpa.c mlkem-native_ml-kem-1024_ref/kem.c mlkem-native_ml-kem-1024_ref/ntt.c mlkem-native_ml-kem-1024_ref/poly.c mlkem-native_ml-kem-1024_ref/polyvec.c mlkem-native_ml-kem-1024_ref/rej_uniform.c mlkem-native_ml-kem-1024_ref/verify.c mlkem-native_ml-kem-1024_ref/zetas.c) + add_library(ml_kem_1024_ref OBJECT kem_ml_kem_1024.c mlkem-native_ml-kem-1024_ref/compress.c mlkem-native_ml-kem-1024_ref/debug.c mlkem-native_ml-kem-1024_ref/indcpa.c mlkem-native_ml-kem-1024_ref/kem.c mlkem-native_ml-kem-1024_ref/poly.c mlkem-native_ml-kem-1024_ref/poly_k.c mlkem-native_ml-kem-1024_ref/sampling.c mlkem-native_ml-kem-1024_ref/verify.c mlkem-native_ml-kem-1024_ref/zetas.c) target_compile_options(ml_kem_1024_ref PUBLIC -DMLKEM_K=4 -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM1024_C) target_include_directories(ml_kem_1024_ref PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-1024_ref) target_include_directories(ml_kem_1024_ref PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) @@ -67,19 +67,19 @@ if(OQS_ENABLE_KEM_ml_kem_1024) endif() if(OQS_ENABLE_KEM_ml_kem_1024_x86_64) - add_library(ml_kem_1024_x86_64 OBJECT mlkem-native_ml-kem-1024_x86_64/cbd.c mlkem-native_ml-kem-1024_x86_64/debug.c mlkem-native_ml-kem-1024_x86_64/indcpa.c mlkem-native_ml-kem-1024_x86_64/kem.c mlkem-native_ml-kem-1024_x86_64/ntt.c mlkem-native_ml-kem-1024_x86_64/poly.c mlkem-native_ml-kem-1024_x86_64/polyvec.c mlkem-native_ml-kem-1024_x86_64/rej_uniform.c mlkem-native_ml-kem-1024_x86_64/verify.c mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.c mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.S 
mlkem-native_ml-kem-1024_x86_64/x86_64/src/consts.c mlkem-native_ml-kem-1024_x86_64/x86_64/src/fq.S mlkem-native_ml-kem-1024_x86_64/x86_64/src/intt.S mlkem-native_ml-kem-1024_x86_64/x86_64/src/ntt.S mlkem-native_ml-kem-1024_x86_64/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-1024_x86_64/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_x86_64/x86_64/src/shuffle.S mlkem-native_ml-kem-1024_x86_64/zetas.c) + add_library(ml_kem_1024_x86_64 OBJECT mlkem-native_ml-kem-1024_x86_64/compress.c mlkem-native_ml-kem-1024_x86_64/debug.c mlkem-native_ml-kem-1024_x86_64/indcpa.c mlkem-native_ml-kem-1024_x86_64/kem.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/consts.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/fq.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/intt.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntt.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/shuffle.S mlkem-native_ml-kem-1024_x86_64/poly.c mlkem-native_ml-kem-1024_x86_64/poly_k.c mlkem-native_ml-kem-1024_x86_64/sampling.c mlkem-native_ml-kem-1024_x86_64/verify.c mlkem-native_ml-kem-1024_x86_64/zetas.c) target_include_directories(ml_kem_1024_x86_64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-1024_x86_64) target_include_directories(ml_kem_1024_x86_64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) target_compile_options(ml_kem_1024_x86_64 PRIVATE -mavx2 -mbmi2 -mpopcnt ) - target_compile_options(ml_kem_1024_x86_64 PUBLIC -DMLKEM_K=4 -DFORCE_X86_64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=X86_64_DEFAULT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_DEFAULT) + target_compile_options(ml_kem_1024_x86_64 PUBLIC -DMLKEM_K=4 -DFORCE_X86_64 
-DMLKEM_NATIVE_ARITH_BACKEND_NAME=X86_64_DEFAULT -DMLKEM_USE_NATIVE_BACKEND_ARITH -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_DEFAULT) set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) endif() if(OQS_ENABLE_KEM_ml_kem_1024_aarch64) - add_library(ml_kem_1024_aarch64 OBJECT mlkem-native_ml-kem-1024_aarch64/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_aarch64/cbd.c mlkem-native_ml-kem-1024_aarch64/debug.c mlkem-native_ml-kem-1024_aarch64/indcpa.c mlkem-native_ml-kem-1024_aarch64/kem.c mlkem-native_ml-kem-1024_aarch64/ntt.c mlkem-native_ml-kem-1024_aarch64/poly.c mlkem-native_ml-kem-1024_aarch64/polyvec.c mlkem-native_ml-kem-1024_aarch64/rej_uniform.c mlkem-native_ml-kem-1024_aarch64/verify.c mlkem-native_ml-kem-1024_aarch64/zetas.c) + add_library(ml_kem_1024_aarch64 OBJECT mlkem-native_ml-kem-1024_aarch64/compress.c mlkem-native_ml-kem-1024_aarch64/debug.c mlkem-native_ml-kem-1024_aarch64/indcpa.c mlkem-native_ml-kem-1024_aarch64/kem.c mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_clean.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_clean.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_clean.S 
mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_aarch64/poly.c mlkem-native_ml-kem-1024_aarch64/poly_k.c mlkem-native_ml-kem-1024_aarch64/sampling.c mlkem-native_ml-kem-1024_aarch64/verify.c mlkem-native_ml-kem-1024_aarch64/zetas.c) target_include_directories(ml_kem_1024_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-1024_aarch64) target_include_directories(ml_kem_1024_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) - target_compile_options(ml_kem_1024_aarch64 PUBLIC -DMLKEM_K=4 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_OPT) + target_compile_options(ml_kem_1024_aarch64 PUBLIC -DMLKEM_K=4 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE_BACKEND_ARITH -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_OPT) set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) endif() diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/arith_backend.h index 0543b1bd1..ade31cda1 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/arith_backend.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/arith_backend.h @@ -17,7 +17,7 @@ * Keep this _after_ the inclusion of the backend; otherwise, * the sanity checks won't have an effect. 
*/ #if defined(MLKEM_NATIVE_CHECK_APIS) -#include "api.h" +#include "native/api.h" #endif #endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbd.c deleted file mode 100644 index 1e6b7c5d1..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbd.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED - -#include -#include "cbd.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) -#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) -/* End of static namespacing */ - -/************************************************* - * Name: load32_littleendian - * - * Description: load 4 bytes into a 32-bit integer - * in little-endian order - * - * Arguments: - const uint8_t *x: pointer to input byte array - * - * Returns 32-bit unsigned integer loaded from x - **************************************************/ -static uint32_t load32_littleendian(const uint8_t x[4]) -{ - uint32_t r; - r = (uint32_t)x[0]; - r |= (uint32_t)x[1] << 8; - r |= (uint32_t)x[2] << 16; - r |= (uint32_t)x[3] << 24; - return r; -} - -MLKEM_NATIVE_INTERNAL_API -void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3))) - { - unsigned j; - uint32_t t = load32_littleendian(buf + 4 * i); - uint32_t d = t & 0x55555555; - d += (t >> 1) & 0x55555555; - - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3))) - { - const int16_t a = (d >> (4 * j + 0)) & 
0x3; - const int16_t b = (d >> (4 * j + 2)) & 0x3; - r->coeffs[8 * i + j] = a - b; - } - } -} - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 -/************************************************* - * Name: load24_littleendian - * - * Description: load 3 bytes into a 32-bit integer - * in little-endian order. - * This function is only needed for ML-KEM-512 - * - * Arguments: - const uint8_t *x: pointer to input byte array - * - * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) - **************************************************/ -static uint32_t load24_littleendian(const uint8_t x[3]) -{ - uint32_t r; - r = (uint32_t)x[0]; - r |= (uint32_t)x[1] << 8; - r |= (uint32_t)x[2] << 16; - return r; -} - -MLKEM_NATIVE_INTERNAL_API -void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 4; i++) - __loop__( - invariant(i <= MLKEM_N / 4) - invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4))) - { - unsigned j; - const uint32_t t = load24_littleendian(buf + 3 * i); - uint32_t d = t & 0x00249249; - d += (t >> 1) & 0x00249249; - d += (t >> 2) & 0x00249249; - - for (j = 0; j < 4; j++) - __loop__( - invariant(i <= MLKEM_N / 4 && j <= 4) - invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4))) - { - const int16_t a = (d >> (6 * j + 0)) & 0x7; - const int16_t b = (d >> (6 * j + 3)) & 0x7; - r->coeffs[4 * i + j] = a - b; - } - } -} -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \ - 3 */ - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd) -int empty_cu_cbd; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbd.h deleted file mode 100644 index 54c1f5b90..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbd.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright 
(c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef CBD_H -#define CBD_H - -#include -#include "common.h" -#include "poly.h" - -#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2) -/************************************************* - * Name: poly_cbd2 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter eta=2 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]); - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 -#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3) -/************************************************* - * Name: poly_cbd3 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter eta=3. 
- * This function is only needed for ML-KEM-512 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]); -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */ - -#endif /* CBD_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/common.h index 4f326333e..62ed53ab1 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/common.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/common.h @@ -15,12 +15,19 @@ #include "sys.h" /* Include backend metadata */ -#if defined(MLKEM_USE_NATIVE) -#if defined(MLKEM_NATIVE_ARITH_BACKEND) -#include MLKEM_NATIVE_ARITH_BACKEND +#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) +#if defined(MLKEM_NATIVE_ARITH_BACKEND_FILE) +#include MLKEM_NATIVE_ARITH_BACKEND_FILE +#else +#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_ARITH is set, but MLKEM_NATIVE_ARITH_BACKEND_FILE is not. +#endif #endif -#if defined(MLKEM_NATIVE_FIPS202_BACKEND) -#include MLKEM_NATIVE_FIPS202_BACKEND + +#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) +#if defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE) +#include MLKEM_NATIVE_FIPS202_BACKEND_FILE +#else +#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, but MLKEM_NATIVE_FIPS202_BACKEND_FILE is not. 
#endif #endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/compress.c new file mode 100644 index 000000000..a03fe0ac4 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/compress.c @@ -0,0 +1,395 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include "common.h" +#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) + +#include +#include +#include "arith_backend.h" +#include "cbmc.h" +#include "compress.h" +#include "debug.h" +#include "verify.h" + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3) +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + uint8_t t[8] = {0}; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_bound(t, 0, j, 0, 16))) + { + t[j] = scalar_compress_d4(a->coeffs[8 * i + j]); + } + + r[i * 4] = t[0] | (t[1] << 4); + r[i * 4 + 1] = t[2] | (t[3] << 4); + r[i * 4 + 2] = t[4] | (t[5] << 4); + r[i * 4 + 3] = t[6] | (t[7] << 4); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a) +{ + unsigned j; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + for (j = 0; j < MLKEM_N / 4; j++) + __loop__(invariant(j <= MLKEM_N / 4)) + { + unsigned k; + uint16_t t[4]; + for (k = 0; k < 4; k++) + __loop__( + invariant(k <= 4) + invariant(forall(r, 0, k, t[r] < (1u << 10)))) + { + t[k] = scalar_compress_d10(a->coeffs[4 * j + k]); + } + + /* + * Make all implicit truncation explicit. No data is being + * truncated for the LHS's since each t[i] is 10-bit in size. 
+ */ + r[5 * j + 0] = (t[0] >> 0) & 0xFF; + r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF); + r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF); + r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF); + r[5 * j + 4] = (t[3] >> 2); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 2; i++) + __loop__( + invariant(i <= MLKEM_N / 2) + invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q))) + { + r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF); + r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF); + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d10(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]) +{ + unsigned j; + for (j = 0; j < MLKEM_N / 4; j++) + __loop__( + invariant(j <= MLKEM_N / 4) + invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q))) + { + unsigned k; + uint16_t t[4]; + uint8_t const *base = &a[5 * j]; + + t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); + t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6)); + t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4)); + t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2)); + + for (k = 0; k < 4; k++) + __loop__( + invariant(k <= 4) + invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q))) + { + r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ + || MLKEM_K == 3) */ + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + uint8_t t[8] = {0}; + for (j = 0; 
j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_bound(t, 0, j, 0, 32))) + { + t[j] = scalar_compress_d5(a->coeffs[8 * i + j]); + } + + /* + * Explicitly truncate to avoid warning about + * implicit truncation in CBMC, and use array indexing into + * r rather than pointer-arithmetic to simplify verification + */ + r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5)); + r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); + r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4)); + r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); + r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3)); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a) +{ + unsigned j; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (j = 0; j < MLKEM_N / 8; j++) + __loop__(invariant(j <= MLKEM_N / 8)) + { + unsigned k; + uint16_t t[8]; + for (k = 0; k < 8; k++) + __loop__( + invariant(k <= 8) + invariant(forall(r, 0, k, t[r] < (1u << 11)))) + { + t[k] = scalar_compress_d11(a->coeffs[8 * j + k]); + } + + /* + * Make all implicit truncation explicit. No data is being + * truncated for the LHS's since each t[i] is 11-bit in size. 
+ */ + r[11 * j + 0] = (t[0] >> 0) & 0xFF; + r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF); + r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF); + r[11 * j + 3] = (t[2] >> 2) & 0xFF; + r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF); + r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF); + r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF); + r[11 * j + 7] = (t[5] >> 1) & 0xFF; + r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF); + r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF); + r[11 * j + 10] = (t[7] >> 3); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) + { + unsigned j; + uint8_t t[8]; + const unsigned offset = i * 5; + /* + * Explicitly truncate to avoid warning about + * implicit truncation in CBMC and unwind loop for ease + * of proof. + */ + + /* + * Decompress 5 8-bit bytes (so 40 bits) into + * 8 5-bit values stored in t[] + */ + t[0] = 0x1F & (a[offset + 0] >> 0); + t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3)); + t[2] = 0x1F & (a[offset + 1] >> 2); + t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1)); + t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4)); + t[5] = 0x1F & (a[offset + 3] >> 1); + t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2)); + t[7] = 0x1F & (a[offset + 4] >> 3); + + /* and copy to the correct slice in r[] */ + for (j = 0; j < 8; j++) + __loop__( + invariant(j <= 8 && i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) + { + r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d11(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]) +{ + unsigned j; + for (j = 0; j < MLKEM_N / 8; j++) + __loop__( + invariant(j <= MLKEM_N 
/ 8) + invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q))) + { + unsigned k; + uint16_t t[8]; + uint8_t const *base = &a[11 * j]; + t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); + t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5)); + t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) | + ((uint16_t)base[4] << 10)); + t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7)); + t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4)); + t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) | + ((uint16_t)base[8] << 9)); + t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6)); + t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3)); + + for (k = 0; k < 8; k++) + __loop__( + invariant(k <= 8) + invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q))) + { + r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 */ + +#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 2; i++) + __loop__(invariant(i <= MLKEM_N / 2)) + { + const uint16_t t0 = a->coeffs[2 * i]; + const uint16_t t1 = a->coeffs[2 * i + 1]; + /* + * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of + * significant data, so these can be packed into 24 bits or exactly + * 3 bytes, as follows. + */ + + /* Least significant bits 0 - 7 of t0. */ + r[3 * i + 0] = t0 & 0xFF; + + /* + * Most significant bits 8 - 11 of t0 become the least significant + * nibble of the second byte. The least significant 4 bits + * of t1 become the upper nibble of the second byte. + */ + r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0); + + /* Bits 4 - 11 of t1 become the third byte.
*/ + r[3 * i + 2] = t1 >> 4; + } +} +#else /* MLKEM_USE_NATIVE_POLY_TOBYTES */ +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +{ + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + poly_tobytes_native(r, a->coeffs); +} +#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + +#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 2; i++) + __loop__( + invariant(i <= MLKEM_N / 2) + invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT))) + { + const uint8_t t0 = a[3 * i + 0]; + const uint8_t t1 = a[3 * i + 1]; + const uint8_t t2 = a[3 * i + 2]; + r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF); + r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4); + } + + /* Note that the coefficients are not canonical */ + debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT); +} +#else /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +{ + poly_frombytes_native(r->coeffs, a); +} +#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ + +MLKEM_NATIVE_INTERNAL_API +void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +{ + unsigned i; +#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8) +#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!" 
+#endif + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) + { + unsigned j; + for (j = 0; j < 8; j++) + __loop__( + invariant(i < MLKEM_N / 8 && j <= 8) + invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) + { + /* Prevent the compiler from recognizing this as a bit selection */ + uint8_t mask = value_barrier_u8(1u << j); + r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask); + } + } + debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + msg[i] = 0; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8)) + { + uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]); + msg[i] |= t << j; + } + } +} + +#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ + +#define empty_cu_compress MLKEM_NAMESPACE_K(empty_cu_compress) +int empty_cu_compress; + +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/compress.h new file mode 100644 index 000000000..409dbe519 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/compress.h @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef COMPRESS_H +#define COMPRESS_H + +#include +#include +#include "cbmc.h" +#include "common.h" +#include "debug.h" +#include "poly.h" +#include "verify.h" + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1) +#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4) +#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5) +#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10) +#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11) +#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4) +#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5) +#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10) +#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11) +/* End of static namespacing */ + +/************************************************************ + * Name: scalar_compress_d1 + * + * Description: Computes round(u * 2 / q) + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 1. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. + */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d1(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 2) + ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) ) +{ + uint32_t d0 = u << 1; + d0 *= 645083; + d0 += 1u << 30; + d0 >>= 31; + return d0; +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_compress_d4 + * + * Description: Computes round(u * 16 / q) % 16 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 4. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. 
+ ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. + */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d4(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 16) + ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16)) +{ + uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */ + return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */ +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d4 + * + * Description: Computes round(u * q / 16) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 4. + * + * Arguments: - u: Unsigned canonical modulus modulo 16 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d4(uint32_t u) +__contract__( + requires(0 <= u && u < 16) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 8) / 16; } + +/************************************************************ + * Name: scalar_compress_d5 + * + * Description: Computes round(u * 32 / q) % 32 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 5. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. 
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d5(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 32) + ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) ) +{ + uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */ + return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */ +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d5 + * + * Description: Computes round(u * q / 32) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 5. + * + * Arguments: - u: Unsigned canonical modulus modulo 32 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d5(uint32_t u) +__contract__( + requires(0 <= u && u < 32) + ensures(return_value <= MLKEM_Q - 1) +) { return ((u * MLKEM_Q) + 16) / 32; } + +/************************************************************ + * Name: scalar_compress_d10 + * + * Description: Computes round(u * 2**10 / q) % 2**10 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 10. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. 
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d10(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < (1u << 10)) + ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10))) +{ + uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^33 / MLKEM_Q) */ + d0 = (d0 + ((uint64_t)1u << 32)) >> 33; + return (d0 & 0x3FF); +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d10 + * + * Description: Computes round(u * q / 1024) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 10. + * + * Arguments: - u: Unsigned canonical modulus modulo 1024 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d10(uint32_t u) +__contract__( + requires(0 <= u && u < 1024) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 512) / 1024; } + +/************************************************************ + * Name: scalar_compress_d11 + * + * Description: Computes round(u * 2**11 / q) % 2**11 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 11. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required.
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d11(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < (1u << 11)) + ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11))) +{ + uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */ + d0 = (d0 + ((uint64_t)1u << 32)) >> 33; + return (d0 & 0x7FF); +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d11 + * + * Description: Computes round(u * q / 2048) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 11. + * + * Arguments: - u: Unsigned canonical modulus modulo 2048 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d11(uint32_t u) +__contract__( + requires(0 <= u && u < 2048) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 1024) / 2048; } + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \ + (MLKEM_K == 2 || MLKEM_K == 3) +#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4) +/************************************************* + * Name: poly_compress_d4 + * + * Description: Compression (4 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a); + +#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10) +/************************************************* + * Name: poly_compress_d10 + * + * Description: Compression (10 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a); + +#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4) +/************************************************* + * Name: poly_decompress_d4 + * + * Description: De-serialization and subsequent decompression (4 bits) of a + * polynomial; approximate inverse of poly_compress_d4 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q).
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]); + +#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10) +/************************************************* + * Name: poly_decompress_d10 + * + * Description: De-serialization and subsequent decompression (10 bits) of a + * polynomial; approximate inverse of poly_compress_d10 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d10(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]); +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ + || MLKEM_K == 3) */ + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 +#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5) +/************************************************* + * Name: poly_compress_d5 + * + * Description: Compression (5 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a); + +#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11) +/************************************************* + * Name: poly_compress_d11 + * + * Description: Compression (11 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a); + +#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5) +/************************************************* + * Name: poly_decompress_d5 + * + * Description: De-serialization and subsequent decompression (5 bits) of a + * polynomial; approximate inverse of poly_compress_d5 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q).
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]); + +#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11) +/************************************************* + * Name: poly_decompress_d11 + * + * Description: De-serialization and subsequent decompression (11 bits) of a + * polynomial; approximate inverse of poly_compress_d11 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d11(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]); +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \ + */ + +#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes) +/************************************************* + * Name: poly_tobytes + * + * Description: Serialization of a polynomial. + * Signed coefficients are converted to + * unsigned form before serialization. 
+ * + * Arguments: INPUT: + * - a: const pointer to input polynomial, + * with each coefficient in the range [0,1,..,Q-1] + * OUTPUT + * - r: pointer to output byte array + * (of MLKEM_POLYBYTES bytes) + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYBYTES)) + requires(memory_no_alias(a, sizeof(poly))) + requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(r)) +); + + +#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes) +/************************************************* + * Name: poly_frombytes + * + * Description: De-serialization of a polynomial. + * + * Arguments: INPUT + * - a: pointer to input byte array + * (of MLKEM_POLYBYTES bytes) + * OUTPUT + * - r: pointer to output polynomial, with + * each coefficient unsigned and in the range + * 0 .. 4095 + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)) +); + + +#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg) +/************************************************* + * Name: poly_frommsg + * + * Description: Convert 32-byte message to polynomial + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *msg: pointer to input message + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +__contract__( + requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(object_whole(r)) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) +); + +#define poly_tomsg 
MLKEM_NAMESPACE(poly_tomsg) +/************************************************* + * Name: poly_tomsg + * + * Description: Convert polynomial to 32-byte message + * + * Arguments: - uint8_t *msg: pointer to output message + * - const poly *r: pointer to input polynomial + * Coefficients must be unsigned canonical + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r) +__contract__( + requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(msg)) +); + +#endif /* COMPRESS_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/config.h index fa89370ce..e975ede95 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/config.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/config.h @@ -122,46 +122,87 @@ /* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ /****************************************************************************** - * Name: MLKEM_USE_NATIVE + * Name: MLKEM_USE_NATIVE_BACKEND_ARITH * - * Description: Determines whether a native backend should - * be used, if available. + * Description: Determines whether an native arithmetic backend should be used. + * + * The arithmetic backend covers performance critical functions + * such as the number-theoretic transform (NTT). + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the arithmetic backend to be use is + * determined by MLKEM_NATIVE_ARITH_BACKEND: If the latter is + * unset, the default backend for your the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. * * This can also be set using CFLAGS. 
* *****************************************************************************/ -#if !defined(MLKEM_USE_NATIVE) -/* #define MLKEM_USE_NATIVE */ +#if !defined(MLKEM_USE_NATIVE_BACKEND_ARITH) +/* #define MLKEM_USE_NATIVE_BACKEND_ARITH */ #endif /****************************************************************************** - * Name: MLKEM_NATIVE_ARITH_BACKEND + * Name: MLKEM_NATIVE_ARITH_BACKEND_FILE * * Description: The arithmetic backend to use. * - * This must be the filename of an arithmetic backend. - * See the existing backends for examples. + * If MLKEM_USE_NATIVE_BACKEND_ARITH is unset, this option + * is ignored. + * + * If MLKEM_USE_NATIVE_BACKEND_ARITH is set, this option must + * either be undefined or the filename of an arithmetic backend. + * If unset, the default backend will be used. * * This can be set using CFLAGS. * *****************************************************************************/ -#if defined(MLKEM_USE_NATIVE) && !defined(MLKEM_NATIVE_ARITH_BACKEND) -#define MLKEM_NATIVE_ARITH_BACKEND "default.h" -#endif /* MLKEM_NATIVE_ARITH_BACKEND */ +#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) && \ + !defined(MLKEM_NATIVE_ARITH_BACKEND_FILE) +#define MLKEM_NATIVE_ARITH_BACKEND_FILE "native/default.h" +#endif /****************************************************************************** - * Name: MLKEM_NATIVE_FIPS202_BACKEND + * Name: MLKEM_USE_NATIVE_BACKEND_FIPS202 + * + * Description: Determines whether an native FIPS202 backend should be used. + * + * The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is + * the performance bottleneck of SHA3 and SHAKE. + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the FIPS202 backend to be use is + * determined by MLKEM_NATIVE_FIPS202_BACKEND: If the latter is + * unset, the default backend for your the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. + * + * This can also be set using CFLAGS. 
+ * + *****************************************************************************/ +#if !defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) +/* #define MLKEM_USE_NATIVE_BACKEND_FIPS202 */ +#endif + +/****************************************************************************** + * Name: MLKEM_NATIVE_FIPS202_BACKEND_FILE * * Description: The FIPS-202 backend to use. * - * This must be the filename of an FIPS-202 backend. + * If MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, this option must + * either be undefined or the filename of a FIPS202 backend. + * If unset, the default backend will be used. * * This can be set using CFLAGS. * *****************************************************************************/ -#if defined(MLKEM_USE_NATIVE_FIPS202) && !defined(MLKEM_NATIVE_FIPS202_BACKEND) -#define MLKEM_NATIVE_FIPS202_BACKEND "native/default.h" -#endif /* MLKEM_NATIVE_FIPS202_BACKEND */ +#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) && \ + !defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE) +#define MLKEM_NATIVE_FIPS202_BACKEND_FILE "fips202/native/default.h" +#endif /************************* Config internals ********************************/ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.c index 0cfcc3e9e..318d0fc77 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.c @@ -9,11 +9,10 @@ #include "fips202.h" #include "fips202x4.h" #include "indcpa.h" -#include "ntt.h" #include "poly.h" -#include "polyvec.h" +#include "poly_k.h" #include "randombytes.h" -#include "rej_uniform.h" +#include "sampling.h" #include "symmetric.h" #include "arith_backend.h" @@ -149,14 +148,14 @@ static void unpack_ciphertext(polyvec *b, poly *v, #define poly_permute_bitrev_to_custom \ MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom) -static INLINE void poly_permute_bitrev_to_custom(poly *data) +static INLINE void poly_permute_bitrev_to_custom(int16_t 
data[MLKEM_N]) __contract__( /* We don't specify that this should be a permutation, but only * that it does not change the bound established at the end of gen_matrix. */ - requires(memory_no_alias(data, sizeof(poly))) - requires(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N)) + requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q)) assigns(memory_slice(data, sizeof(poly))) - ensures(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); } + ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); } #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ /* Not static for benchmarking */ @@ -245,7 +244,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed) { for (j = 0; j < MLKEM_K; j++) { - poly_permute_bitrev_to_custom(&a[i].vec[j]); + poly_permute_bitrev_to_custom(a[i].vec[j].coeffs); } } } diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.h index 2c4fda3c4..b4d5985bf 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.h @@ -8,7 +8,7 @@ #include #include "cbmc.h" #include "common.h" -#include "polyvec.h" +#include "poly_k.h" #define gen_matrix MLKEM_NAMESPACE_K(gen_matrix) /************************************************* diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/README.md b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/README.md similarity index 100% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/README.md rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/README.md diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/clean.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/clean.h similarity index 90% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/clean.h rename to 
src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/clean.h index 43a401dfc..f124702a4 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/clean.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/clean.h @@ -19,6 +19,6 @@ /* Filename of the C backend implementation. * This is not inlined here because this header is included in assembly * files as well. */ -#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "aarch64/src/clean_impl.h" +#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/aarch64/src/clean_impl.h" #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/opt.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/opt.h similarity index 91% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/opt.h rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/opt.h index 04323c3e7..a7217163f 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/opt.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/opt.h @@ -19,6 +19,6 @@ /* Filename of the C backend implementation. * This is not inlined here because this header is included in assembly * files as well. 
*/ -#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "aarch64/src/opt_impl.h" +#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/aarch64/src/opt_impl.h" #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/aarch64_zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/aarch64_zetas.c similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/aarch64_zetas.c rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/aarch64_zetas.c index 1e189fd99..b3a6f198f 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/aarch64_zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/aarch64_zetas.c @@ -8,7 +8,7 @@ * Do not modify it directly. */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/arith_native_aarch64.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/arith_native_aarch64.h similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/arith_native_aarch64.h rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/arith_native_aarch64.h index fc4e7dd38..a784a3027 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/arith_native_aarch64.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/arith_native_aarch64.h @@ -6,7 +6,7 @@ #define MLKEM_AARCH64_NATIVE_H #include -#include "common.h" +#include "../../../common.h" #define aarch64_ntt_zetas_layer01234 \ MLKEM_NAMESPACE(aarch64_ntt_zetas_layer01234) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/clean_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/clean_impl.h similarity index 58% rename from 
src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/clean_impl.h rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/clean_impl.h index 548b1eebb..ded7d067a 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/clean_impl.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/clean_impl.h @@ -12,9 +12,6 @@ #include "arith_native_aarch64.h" -#include "poly.h" -#include "polyvec.h" - /* Set of primitives that this backend replaces */ #define MLKEM_USE_NATIVE_NTT #define MLKEM_USE_NATIVE_INTT @@ -25,45 +22,46 @@ #define MLKEM_USE_NATIVE_POLY_TOBYTES #define MLKEM_USE_NATIVE_REJ_UNIFORM -static INLINE void ntt_native(poly *data) +static INLINE void ntt_native(int16_t data[MLKEM_N]) { - ntt_asm_clean(data->coeffs, aarch64_ntt_zetas_layer01234, - aarch64_ntt_zetas_layer56); + ntt_asm_clean(data, aarch64_ntt_zetas_layer01234, aarch64_ntt_zetas_layer56); } -static INLINE void intt_native(poly *data) +static INLINE void intt_native(int16_t data[MLKEM_N]) { - intt_asm_clean(data->coeffs, aarch64_invntt_zetas_layer01234, + intt_asm_clean(data, aarch64_invntt_zetas_layer01234, aarch64_invntt_zetas_layer56); } -static INLINE void poly_reduce_native(poly *data) +static INLINE void poly_reduce_native(int16_t data[MLKEM_N]) { - poly_reduce_asm_clean(data->coeffs); + poly_reduce_asm_clean(data); } -static INLINE void poly_tomont_native(poly *data) + +static INLINE void poly_tomont_native(int16_t data[MLKEM_N]) { - poly_tomont_asm_clean(data->coeffs); + poly_tomont_asm_clean(data); } -static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const poly *y) +static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2], + const int16_t y[MLKEM_N]) { - poly_mulcache_compute_asm_clean(x->coeffs, y->coeffs, - aarch64_zetas_mulcache_native, + poly_mulcache_compute_asm_clean(x, y, aarch64_zetas_mulcache_native, aarch64_zetas_mulcache_twisted_native); } + static INLINE void 
polyvec_basemul_acc_montgomery_cached_native( - poly *r, const polyvec *a, const polyvec *b, - const polyvec_mulcache *b_cache) + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]) { - polyvec_basemul_acc_montgomery_cached_asm_clean( - r->coeffs, a->vec[0].coeffs, b->vec[0].coeffs, b_cache->vec[0].coeffs); + polyvec_basemul_acc_montgomery_cached_asm_clean(r, a, b, b_cache); } static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], - const poly *a) + const int16_t a[MLKEM_N]) { - poly_tobytes_asm_clean(r, a->coeffs); + poly_tobytes_asm_clean(r, a); } static INLINE int rej_uniform_native(int16_t *r, unsigned int len, diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/consts.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/consts.h similarity index 94% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/consts.h rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/consts.h index c40947299..e3ea26a27 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/consts.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/consts.h @@ -7,7 +7,7 @@ #define MLKEM_NATIVE_AARCH64_CONSTS #include -#include "common.h" +#include "../../../common.h" #define zetas_mulcache_native MLKEM_NAMESPACE(zetas_mulcache_native) extern const int16_t zetas_mulcache_native[256]; diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_clean.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_clean.S rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_clean.S index b243a569d..28ad38975 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_clean.S +++ 
b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_clean.S @@ -23,7 +23,7 @@ /// SOFTWARE. /// -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) // Bounds: diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_opt.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_opt.S rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_opt.S index c94746e17..857c729cb 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_opt.S @@ -23,7 +23,7 @@ /// SOFTWARE. /// -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) // Bounds: diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_clean.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_clean.S rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_clean.S index cd63cc4d6..30fdc76b0 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_clean.S @@ -24,7 +24,7 @@ /// SOFTWARE. 
/// -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) // Bounds: diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_opt.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_opt.S rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_opt.S index 8705615b7..431f9dc6f 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_opt.S @@ -24,7 +24,7 @@ /// SOFTWARE. /// -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) // Bounds: diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/opt_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/opt_impl.h similarity index 58% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/opt_impl.h rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/opt_impl.h index ec1bf6587..eb8e39ed0 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/opt_impl.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/opt_impl.h @@ -10,11 +10,9 @@ #else #define MLKEM_NATIVE_ARITH_PROFILE_IMPL_H +#include "../../../params.h" #include "arith_native_aarch64.h" -#include "poly.h" -#include "polyvec.h" - /* Set of primitives that this backend replaces */ #define MLKEM_USE_NATIVE_NTT #define MLKEM_USE_NATIVE_INTT @@ -25,45 +23,46 @@ #define MLKEM_USE_NATIVE_POLY_TOBYTES #define MLKEM_USE_NATIVE_REJ_UNIFORM -static INLINE void ntt_native(poly *data) +static INLINE void ntt_native(int16_t data[MLKEM_N]) { - ntt_asm_opt(data->coeffs, aarch64_ntt_zetas_layer01234, - aarch64_ntt_zetas_layer56); + ntt_asm_opt(data, aarch64_ntt_zetas_layer01234, 
aarch64_ntt_zetas_layer56); } -static INLINE void intt_native(poly *data) +static INLINE void intt_native(int16_t data[MLKEM_N]) { - intt_asm_opt(data->coeffs, aarch64_invntt_zetas_layer01234, + intt_asm_opt(data, aarch64_invntt_zetas_layer01234, aarch64_invntt_zetas_layer56); } -static INLINE void poly_reduce_native(poly *data) +static INLINE void poly_reduce_native(int16_t data[MLKEM_N]) { - poly_reduce_asm_opt(data->coeffs); + poly_reduce_asm_opt(data); } -static INLINE void poly_tomont_native(poly *data) + +static INLINE void poly_tomont_native(int16_t data[MLKEM_N]) { - poly_tomont_asm_opt(data->coeffs); + poly_tomont_asm_opt(data); } -static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const poly *y) +static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2], + const int16_t y[MLKEM_N]) { - poly_mulcache_compute_asm_opt(x->coeffs, y->coeffs, - aarch64_zetas_mulcache_native, + poly_mulcache_compute_asm_opt(x, y, aarch64_zetas_mulcache_native, aarch64_zetas_mulcache_twisted_native); } + static INLINE void polyvec_basemul_acc_montgomery_cached_native( - poly *r, const polyvec *a, const polyvec *b, - const polyvec_mulcache *b_cache) + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]) { - polyvec_basemul_acc_montgomery_cached_asm_opt( - r->coeffs, a->vec[0].coeffs, b->vec[0].coeffs, b_cache->vec[0].coeffs); + polyvec_basemul_acc_montgomery_cached_asm_opt(r, a, b, b_cache); } static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], - const poly *a) + const int16_t a[MLKEM_N]) { - poly_tobytes_asm_opt(r, a->coeffs); + poly_tobytes_asm_opt(r, a); } static INLINE int rej_uniform_native(int16_t *r, unsigned int len, diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/optimize.sh b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/optimize.sh similarity index 100% rename from 
src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/optimize.sh rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/optimize.sh diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_clean.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_clean.S rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_clean.S index 809f9667e..f3ee0796f 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_clean.S @@ -3,7 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) /* diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_opt.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_opt.S rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_opt.S index 815a9dd1a..555c60a67 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_opt.S @@ -3,7 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) /* diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_clean.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_clean.S rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_clean.S index 
c91675b44..0b6df6345 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_clean.S @@ -9,7 +9,7 @@ // https://eprint.iacr.org/2021/986 // https://github.com/neon-ntt/neon-ntt -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) // Input: diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_opt.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_opt.S rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_opt.S index 8300b682c..7a27fda3e 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_opt.S @@ -9,7 +9,7 @@ // https://eprint.iacr.org/2021/986 // https://github.com/neon-ntt/neon-ntt -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) // Input: diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_asm_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_asm_clean.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_asm_clean.S rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_asm_clean.S index 5151a05d0..9158d6c82 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_asm_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_asm_clean.S @@ -18,7 +18,7 @@ * * Returns number of sampled 16-bit integers (at most MLKEM_N). 
**************************************************/ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_table.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_table.c similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_table.c rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_table.c index 507660349..29cdbe95f 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_table.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_table.c @@ -8,7 +8,7 @@ * Do not modify it directly. */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/api.h similarity index 90% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/api.h rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/api.h index 792ecb8a4..0704f9dcd 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/api.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/api.h @@ -23,8 +23,7 @@ #define MLKEM_NATIVE_ARITH_NATIVE_API_H #include -#include "poly.h" -#include "polyvec.h" +#include "../common.h" /* * This is the C<->native interface allowing for the drop-in of @@ -65,9 +64,9 @@ * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER * for more information. 
* - * Arguments: - poly *p: pointer to in/output polynomial + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial **************************************************/ -static INLINE void ntt_native(poly *); +static INLINE void ntt_native(int16_t p[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_NTT */ #if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER) @@ -96,10 +95,10 @@ and to/from bytes conversions." * * This must only be defined if there is native code for * all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache. - * Arguments: - poly *p: pointer to in/output polynomial + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial * **************************************************/ -static INLINE void poly_permute_bitrev_to_custom(poly *); +static INLINE void poly_permute_bitrev_to_custom(int16_t p[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ #if defined(MLKEM_USE_NATIVE_INTT) @@ -117,7 +116,7 @@ static INLINE void poly_permute_bitrev_to_custom(poly *); * * Arguments: - uint16_t *a: pointer to in/output polynomial **************************************************/ -static INLINE void intt_native(poly *); +static INLINE void intt_native(int16_t p[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_INTT */ #if defined(MLKEM_USE_NATIVE_POLY_REDUCE) @@ -126,9 +125,9 @@ static INLINE void intt_native(poly *); * * Description: Applies modular reduction to all coefficients of a polynomial. 
* - * Arguments: - poly *r: pointer to input/output polynomial + * Arguments: - int16_t r[MLKEM_N]: pointer to input/output polynomial **************************************************/ -static INLINE void poly_reduce_native(poly *); +static INLINE void poly_reduce_native(int16_t p[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ #if defined(MLKEM_USE_NATIVE_POLY_TOMONT) @@ -138,9 +137,9 @@ static INLINE void poly_reduce_native(poly *); * Description: Inplace conversion of all coefficients of a polynomial * from normal domain to Montgomery domain * - * Arguments: - poly *r: pointer to input/output polynomial + * Arguments: - int16_t r[MLKEM_N]: pointer to input/output polynomial **************************************************/ -static INLINE void poly_tomont_native(poly *); +static INLINE void poly_tomont_native(int16_t p[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ #if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) @@ -165,8 +164,8 @@ static INLINE void poly_tomont_native(poly *); * OUTPUT * - cache: pointer to multiplication cache **************************************************/ -static INLINE void poly_mulcache_compute_native(poly_mulcache *cache, - const poly *poly); +static INLINE void poly_mulcache_compute_native(int16_t cache[MLKEM_N / 2], + const int16_t poly[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ #if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) @@ -189,8 +188,9 @@ static INLINE void poly_mulcache_compute_native(poly_mulcache *cache, * in NTT domain, and of the same order as a and b. 
**************************************************/ static INLINE void polyvec_basemul_acc_montgomery_cached_native( - poly *r, const polyvec *a, const polyvec *b, - const polyvec_mulcache *b_cache); + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]); #endif #if defined(MLKEM_USE_NATIVE_POLY_TOBYTES) @@ -209,7 +209,7 @@ static INLINE void polyvec_basemul_acc_montgomery_cached_native( * (of MLKEM_POLYBYTES bytes) **************************************************/ static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], - const poly *a); + const int16_t a[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ #if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) @@ -226,7 +226,7 @@ static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], * - a: const pointer to input byte aray * (of MLKEM_POLYBYTES bytes) **************************************************/ -static INLINE void poly_frombytes_native(poly *a, +static INLINE void poly_frombytes_native(int16_t a[MLKEM_N], const uint8_t r[MLKEM_POLYBYTES]); #endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/default.h similarity index 97% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/default.h rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/default.h index d1e41c52e..f9fe4310a 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/default.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/default.h @@ -8,7 +8,7 @@ /* * Default arithmetic backend */ -#include "sys.h" +#include "../sys.h" #ifdef SYS_AARCH64 /* diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/ntt.c deleted file mode 100644 index 3651c8da9..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/ntt.c +++ /dev/null @@ -1,266 
+0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) - -#include -#include "arith_backend.h" -#include "debug.h" -#include "ntt.h" -#include "reduce.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block) -#define ntt_layer MLKEM_NAMESPACE(ntt_layer) -#define invntt_layer MLKEM_NAMESPACE(invntt_layer) -/* End of static namespacing */ - -#if !defined(MLKEM_USE_NATIVE_NTT) -/* - * Computes a block CT butterflies with a fixed twiddle factor, - * using Montgomery multiplication. - * Parameters: - * - r: Pointer to base of polynomial (_not_ the base of butterfly block) - * - root: Twiddle factor to use for the butterfly. This must be in - * Montgomery form and signed canonical. - * - start: Offset to the beginning of the butterfly block - * - len: Index difference between coefficients subject to a butterfly - * - bound: Ghost variable describing coefficient bound: Prior to `start`, - * coefficients must be bound by `bound + MLKEM_Q`. Post `start`, - * they must be bound by `bound`. - * When this function returns, output coefficients in the index range - * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`. 
- * Example: - * - start=8, len=4 - * This would compute the following four butterflies - * 8 -- 12 - * 9 -- 13 - * 10 -- 14 - * 11 -- 15 - * - start=4, len=2 - * This would compute the following two butterflies - * 4 -- 6 - * 5 -- 7 - */ -static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, - unsigned start, unsigned len, int bound) -__contract__( - requires(start < MLKEM_N) - requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N) - requires(0 <= bound && bound < INT16_MAX - MLKEM_Q) - requires(-HALF_Q < zeta && zeta < HALF_Q) - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(array_abs_bound(r, 0, start, bound + MLKEM_Q)) - requires(array_abs_bound(r, start, MLKEM_N, bound)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q)) - ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound))) -{ - /* `bound` is a ghost variable only needed in the CBMC specification */ - unsigned j; - ((void)bound); - for (j = start; j < start + len; j++) - __loop__( - invariant(start <= j && j <= start + len) - /* - * Coefficients are updated in strided pairs, so the bounds for the - * intermediate states alternate twice between the old and new bound - */ - invariant(array_abs_bound(r, 0, j, bound + MLKEM_Q)) - invariant(array_abs_bound(r, j, start + len, bound)) - invariant(array_abs_bound(r, start + len, j + len, bound + MLKEM_Q)) - invariant(array_abs_bound(r, j + len, MLKEM_N, bound))) - { - int16_t t; - t = fqmul(r[j + len], zeta); - r[j + len] = r[j] - t; - r[j] = r[j] + t; - } -} - -/* - *Compute one layer of forward NTT - * Parameters: - * - r: Pointer to base of polynomial - * - len: Stride of butterflies in this layer. - * - layer: Ghost variable indicating which layer is being applied. - * Must match `len` via `len == MLKEM_N >> layer`. 
- * Note: `len` could be dropped and computed in the function, but - * we are following the structure of the reference NTT from the - * official Kyber implementation here, merely adding `layer` as - * a ghost variable for the specifications. - */ -static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer) -__contract__( - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer)) - requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q))) -{ - unsigned start, k; - /* `layer` is a ghost variable only needed in the CBMC specification */ - ((void)layer); - /* Twiddle factors for layer n start at index 2^(layer-1) */ - k = MLKEM_N / (2 * len); - for (start = 0; start < MLKEM_N; start += 2 * len) - __loop__( - invariant(start < MLKEM_N + 2 * len) - invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N) - invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q)) - invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q))) - { - int16_t zeta = zetas[k++]; - ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q); - } -} - -/* - * Compute full forward NTT - * NOTE: This particular implementation satisfies a much tighter - * bound on the output coefficients (5*q) than the contractual one (8*q), - * but this is not needed in the calling code. Should we change the - * base multiplication strategy to require smaller NTT output bounds, - * the proof may need strengthening. 
- */ - -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *p) -{ - unsigned len, layer; - int16_t *r; - debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); - r = p->coeffs; - - for (len = 128, layer = 1; len >= 2; len >>= 1, layer++) - __loop__( - invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer)) - invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))) - { - ntt_layer(r, len, layer); - } - - /* Check the stronger bound */ - debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); -} -#else /* MLKEM_USE_NATIVE_NTT */ - -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *p) -{ - debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); - ntt_native(p); - debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); -} -#endif /* MLKEM_USE_NATIVE_NTT */ - -#if !defined(MLKEM_USE_NATIVE_INTT) - -/* Compute one layer of inverse NTT */ -static void invntt_layer(int16_t *r, unsigned len, unsigned layer) -__contract__( - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7) - requires(len == (1 << (8 - layer))) - requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) -{ - unsigned start, k; - /* `layer` is a ghost variable used only in the specification */ - ((void)layer); - k = MLKEM_N / len - 1; - for (start = 0; start < MLKEM_N; start += 2 * len) - __loop__( - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) - invariant(start <= MLKEM_N && k <= 127) - /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */ - invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len)) - { - unsigned j; - int16_t zeta = zetas[k--]; - for (j = start; j < start + len; j++) - __loop__( - invariant(start <= j && j <= start + len) - invariant(start <= MLKEM_N && k <= 127) - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) - { - int16_t t = r[j]; - r[j] = barrett_reduce(t + r[j + len]); - r[j + len] = r[j + len] - t; - r[j + len] = fqmul(r[j + len], 
zeta); - } - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *p) -{ - /* - * Scale input polynomial to account for Montgomery factor - * and NTT twist. This also brings coefficients down to - * absolute value < MLKEM_Q. - */ - unsigned j, len, layer; - const int16_t f = 1441; - int16_t *r = p->coeffs; - - for (j = 0; j < MLKEM_N; j++) - __loop__( - invariant(j <= MLKEM_N) - invariant(array_abs_bound(r, 0, j, MLKEM_Q))) - { - r[j] = fqmul(r[j], f); - } - - /* Run the invNTT layers */ - for (len = 2, layer = 7; len <= 128; len <<= 1, layer--) - __loop__( - invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer))) - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) - { - invntt_layer(p->coeffs, len, layer); - } - - debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); -} -#else /* MLKEM_USE_NATIVE_INTT */ - -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *p) -{ - intt_native(p); - debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); -} -#endif /* MLKEM_USE_NATIVE_INTT */ - -MLKEM_NATIVE_INTERNAL_API -void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], - int16_t b_cached) -{ - int32_t t0, t1; - debug_assert_bound(a, 2, 0, UINT12_LIMIT); - - t0 = (int32_t)a[1] * b_cached; - t0 += (int32_t)a[0] * b[0]; - t1 = (int32_t)a[0] * b[1]; - t1 += (int32_t)a[1] * b[0]; - - /* |ti| < 2 * q * 2^15 */ - r[0] = montgomery_reduce(t0); - r[1] = montgomery_reduce(t1); - - debug_assert_abs_bound(r, 2, 2 * MLKEM_Q); -} - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt) -int empty_cu_ntt; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/ntt.h deleted file mode 100644 index 4e80d3ab3..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/ntt.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * 
SPDX-License-Identifier: Apache-2.0 - */ -#ifndef NTT_H -#define NTT_H -#include "common.h" - -#include -#include "cbmc.h" -#include "poly.h" -#include "reduce.h" - -#define zetas MLKEM_NAMESPACE(zetas) -extern const int16_t zetas[128]; - -#define poly_ntt MLKEM_NAMESPACE(poly_ntt) -/************************************************* - * Name: poly_ntt - * - * Description: Computes negacyclic number-theoretic transform (NTT) of - * a polynomial in place. - * - * The input is assumed to be in normal order and - * coefficient-wise bound by MLKEM_Q in absolute value. - * - * The output polynomial is in bitreversed order, and - * coefficient-wise bound by NTT_BOUND in absolute value. - * - * (NOTE: Sometimes the input to the NTT is actually smaller, - * which gives better bounds.) - * - * Arguments: - poly *p: pointer to in/output polynomial - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *r) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q)) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND)) -); - -#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont) -/************************************************* - * Name: poly_invntt_tomont - * - * Description: Computes inverse of negacyclic number-theoretic transform (NTT) - * of a polynomial in place; - * inputs assumed to be in bitreversed order, output in normal - * order - * - * The input is assumed to be in bitreversed order, and can - * have arbitrary coefficients in int16_t. - * - * The output polynomial is in normal order, and - * coefficient-wise bound by INVNTT_BOUND in absolute value. 
- * - * Arguments: - uint16_t *a: pointer to in/output polynomial - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *r) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND)) -); - -#define basemul_cached MLKEM_NAMESPACE(basemul_cached) -/************************************************************ - * Name: basemul_cached - * - * Description: Computes a representative modulo q of - * (a0*b0 + a1*b_cached, a0*b1 + a1*b0)/65536 - * - * If b_cached is b1*zeta, this represents the - * product of (a0 + a1*X) and (b0 + b1*X) in - * Fq[X]/(X^2 - zeta). - * - * Arguments: - r: Pointer to output polynomial - * Upon return, coefficients are bound by - * 2*MLKEM_Q in absolute value. - * - a: Pointer to first input polynomial - * Every coefficient must be in [0..4095] - * - b: Pointer to second input polynomial - * Can have arbitrary int16_t coefficients - * - b_cached: Some precomputed value, typically derived from - * b1 and a twiddle factor. Can be an arbitary int16_t. 
- ************************************************************/ -MLKEM_NATIVE_INTERNAL_API -void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], - int16_t b_cached) -__contract__( - requires(memory_no_alias(r, 2 * sizeof(int16_t))) - requires(memory_no_alias(a, 2 * sizeof(int16_t))) - requires(memory_no_alias(b, 2 * sizeof(int16_t))) - requires(array_bound(a, 0, 2, 0, UINT12_LIMIT)) - assigns(memory_slice(r, 2 * sizeof(int16_t))) - ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q)) -); - -#endif /* NTT_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/params.h index 57ea4c8ba..7f6c12625 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/params.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/params.h @@ -18,6 +18,7 @@ #define MLKEM_N 256 #define MLKEM_Q 3329 #define UINT12_LIMIT 4096 +#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */ #define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define MLKEM_SSBYTES 32 /* size in bytes of shared key */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.c index 7483ebf6d..e8a2e2c6e 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.c @@ -8,388 +8,246 @@ #include #include #include "arith_backend.h" -#include "cbd.h" #include "cbmc.h" #include "debug.h" #include "fips202x4.h" -#include "ntt.h" #include "poly.h" -#include "reduce.h" +#include "sampling.h" #include "symmetric.h" #include "verify.h" -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3) -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a) -{ - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - uint8_t t[8] = 
{0}; - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_bound(t, 0, j, 0, 16))) - { - t[j] = scalar_compress_d4(a->coeffs[8 * i + j]); - } - - r[i * 4] = t[0] | (t[1] << 4); - r[i * 4 + 1] = t[2] | (t[3] << 4); - r[i * 4 + 2] = t[4] | (t[5] << 4); - r[i * 4 + 3] = t[6] | (t[7] << 4); - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a) -{ - unsigned j; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - for (j = 0; j < MLKEM_N / 4; j++) - __loop__(invariant(j <= MLKEM_N / 4)) - { - unsigned k; - uint16_t t[4]; - for (k = 0; k < 4; k++) - __loop__( - invariant(k <= 4) - invariant(forall(r, 0, k, t[r] < (1u << 10)))) - { - t[k] = scalar_compress_d10(a->coeffs[4 * j + k]); - } - - /* - * Make all implicit truncation explicit. No data is being - * truncated for the LHS's since each t[i] is 10-bit in size. - */ - r[5 * j + 0] = (t[0] >> 0) & 0xFF; - r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF); - r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF); - r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF); - r[5 * j + 4] = (t[3] >> 2); - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 2; i++) - __loop__( - invariant(i <= MLKEM_N / 2) - invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q))) - { - r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF); - r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF); - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); -} - -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d10(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]) +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16) +#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic) +#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce) +#define fqmul MLKEM_NAMESPACE(fqmul) +#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce) +#define basemul_cached MLKEM_NAMESPACE(basemul_cached) +#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q) +#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block) +#define ntt_layer MLKEM_NAMESPACE(ntt_layer) +#define invntt_layer MLKEM_NAMESPACE(invntt_layer) +/* End of static namespacing */ + +/************************************************* + * Name: cast_uint16_to_int16 + * + * Description: Cast uint16 value to int16 + * + * Returns: + * input x in 0 .. 32767: returns value unchanged + * input x in 32768 .. 65535: returns (x - 65536) + **************************************************/ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "conversion" +#endif +ALWAYS_INLINE +static INLINE int16_t cast_uint16_to_int16(uint16_t x) { - unsigned j; - for (j = 0; j < MLKEM_N / 4; j++) - __loop__( - invariant(j <= MLKEM_N / 4) - invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q))) - { - unsigned k; - uint16_t t[4]; - uint8_t const *base = &a[5 * j]; - - t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); - t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6)); - t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4)); - t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2)); - - for (k = 0; k < 4; k++) - __loop__( - invariant(k <= 4) - invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q))) - { - r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]); - } - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + /* + * PORTABILITY: This relies on uint16_t -> int16_t + * being implemented as the inverse of int16_t -> uint16_t, + * which is implementation-defined (C99 6.3.1.3 (3)) + * 
CBMC (correctly) fails to prove this conversion is OK, + * so we have to suppress that check here + */ + return (int16_t)x; } -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ - || MLKEM_K == 3) */ +#ifdef CBMC +#pragma CPROVER check pop +#endif -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a) +/************************************************* + * Name: montgomery_reduce_generic + * + * Description: Generic Montgomery reduction; given a 32-bit integer a, computes + * 16-bit integer congruent to a * R^-1 mod q, where R=2^16 + * + * Arguments: - int32_t a: input integer to be reduced + * + * Returns: integer congruent to a * R^-1 modulo q, with absolute value + * <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2 + * + **************************************************/ +ALWAYS_INLINE +static INLINE int16_t montgomery_reduce_generic(int32_t a) { - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */ + const uint32_t QINV = 62209; /* q^-1 mod 2^16 */ - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - uint8_t t[8] = {0}; - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_bound(t, 0, j, 0, 32))) - { - t[j] = scalar_compress_d5(a->coeffs[8 * i + j]); - } + /* Compute a*q^{-1} mod 2^16 in unsigned representatives */ + const uint16_t a_reduced = a & UINT16_MAX; + const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX; - /* - * Explicitly truncate to avoid warning about - * implicit truncation in CBMC, and use array indexing into - * r rather than pointer-arithmetic to simplify verification - */ - r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5)); - r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); - r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4)); 
- r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); - r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3)); - } -} + /* Lift to signed canonical representative mod 2^16. */ + const int16_t t = cast_uint16_to_int16(a_inverted); -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a) -{ - unsigned j; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + int32_t r = a - ((int32_t)t * MLKEM_Q); + /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */ - for (j = 0; j < MLKEM_N / 8; j++) - __loop__(invariant(j <= MLKEM_N / 8)) - { - unsigned k; - uint16_t t[8]; - for (k = 0; k < 8; k++) - __loop__( - invariant(k <= 8) - invariant(forall(r, 0, k, t[r] < (1u << 11)))) - { - t[k] = scalar_compress_d11(a->coeffs[8 * j + k]); - } + /* + * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, + * implementation-defined for negative left argument. Here, + * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) + */ + r = r >> 16; + /* Bounds: |r >> 16| <= ceil(|r| / 2^16) + * <= ceil(|a| / 2^16 + MLKEM_Q / 2) + * <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 + * + * (Note that |a >> n| = ceil(|a| / 2^16) for negative a) + */ - /* - * Make all implicit truncation explicit. No data is being - * truncated for the LHS's since each t[i] is 11-bit in size. 
- */ - r[11 * j + 0] = (t[0] >> 0) & 0xFF; - r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF); - r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF); - r[11 * j + 3] = (t[2] >> 2) & 0xFF; - r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF); - r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF); - r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF); - r[11 * j + 7] = (t[5] >> 1) & 0xFF; - r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF); - r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF); - r[11 * j + 10] = (t[7] >> 3); - } + return (int16_t)r; } -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]) +/************************************************* + * Name: montgomery_reduce + * + * Description: Montgomery reduction + * + * Arguments: - int32_t a: input integer to be reduced + * Must be smaller than 2 * 2^12 * 2^15 in absolute value. + * + * Returns: integer congruent to a * R^-1 modulo q, + * smaller than 2 * q in absolute value. + **************************************************/ +static INLINE int16_t montgomery_reduce(int32_t a) +__contract__( + requires(a > -(2 * UINT12_LIMIT * 32768)) + requires(a < (2 * UINT12_LIMIT * 32768)) + ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q) +) { - unsigned i; - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) - { - unsigned j; - uint8_t t[8]; - const unsigned offset = i * 5; - /* - * Explicitly truncate to avoid warning about - * implicit truncation in CBMC and unwind loop for ease - * of proof. 
- */ - - /* - * Decompress 5 8-bit bytes (so 40 bits) into - * 8 5-bit values stored in t[] - */ - t[0] = 0x1F & (a[offset + 0] >> 0); - t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3)); - t[2] = 0x1F & (a[offset + 1] >> 2); - t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1)); - t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4)); - t[5] = 0x1F & (a[offset + 3] >> 1); - t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2)); - t[7] = 0x1F & (a[offset + 4] >> 3); - - /* and copy to the correct slice in r[] */ - for (j = 0; j < 8; j++) - __loop__( - invariant(j <= 8 && i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) - { - r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]); - } - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + int16_t res; + debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768); + + res = montgomery_reduce_generic(a); + /* Bounds: + * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2 + * <= UINT12_LIMIT + (MLKEM_Q + 1) / 2 + * < 2 * MLKEM_Q */ + + debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q); + return res; } -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d11(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]) +#if !defined(MLKEM_USE_NATIVE_POLY_TOMONT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: fqmul + * + * Description: Montgomery multiplication modulo q=3329 + * + * Arguments: - int16_t a: first factor + * Can be any int16_t. + * - int16_t b: second factor. + * Must be signed canonical (abs value <(q+1)/2) + * + * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and + * smaller than q in absolute value. 
+ * + **************************************************/ +static INLINE int16_t fqmul(int16_t a, int16_t b) +__contract__( + requires(b > -HALF_Q) + requires(b < HALF_Q) + ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q) +) { - unsigned j; - for (j = 0; j < MLKEM_N / 8; j++) - __loop__( - invariant(j <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q))) - { - unsigned k; - uint16_t t[8]; - uint8_t const *base = &a[11 * j]; - t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); - t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5)); - t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) | - ((uint16_t)base[4] << 10)); - t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7)); - t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4)); - t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) | - ((uint16_t)base[8] << 9)); - t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6)); - t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3)); - - for (k = 0; k < 8; k++) - __loop__( - invariant(k <= 8) - invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q))) - { - r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]); - } - } + int16_t res; + debug_assert_abs_bound(&b, 1, HALF_Q); + + res = montgomery_reduce((int32_t)a * (int32_t)b); + /* Bounds: + * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2 + * < MLKEM_Q + */ - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + debug_assert_abs_bound(&res, 1, MLKEM_Q); + return res; } -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */ - -#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +#endif /* !defined(MLKEM_USE_NATIVE_POLY_TOMONT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_NTT) || \ + 
!defined(MLKEM_USE_NATIVE_INTT) */ + +#if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || !defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: barrett_reduce + * + * Description: Barrett reduction; given a 16-bit integer a, computes + * centered representative congruent to a mod q in + * {-(q-1)/2,...,(q-1)/2} + * + * Arguments: - int16_t a: input integer to be reduced + * + * Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. + **************************************************/ +static INLINE int16_t barrett_reduce(int16_t a) +__contract__( + ensures(return_value > -HALF_Q && return_value < HALF_Q) +) { - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_N / 2; i++) - __loop__(invariant(i <= MLKEM_N / 2)) - { - const uint16_t t0 = a->coeffs[2 * i]; - const uint16_t t1 = a->coeffs[2 * i + 1]; - /* - * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of - * significant data, so these can be packed into 24 bits or exactly - * 3 bytes, as follows. - */ - - /* Least significant bits 0 - 7 of t0. */ - r[3 * i + 0] = t0 & 0xFF; - - /* - * Most significant bits 8 - 11 of t0 become the least significant - * nibble of the second byte. The least significant 4 bits - * of t1 become the upper nibble of the second byte. - */ - r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0); + /* + * To divide by MLKEM_Q using Barrett multiplication, the "magic number" + * multiplier is round_to_nearest(2**26/MLKEM_Q) + */ + const int BPOWER = 26; + const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q; - /* Bits 4 - 11 of t1 become the third byte. 
*/ - r[3 * i + 2] = t1 >> 4; - } -} -#else /* MLKEM_USE_NATIVE_POLY_TOBYTES */ -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) -{ - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - poly_tobytes_native(r, a); -} -#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + /* + * Compute round_to_nearest(a/MLKEM_Q) using the multiplier + * above and shift by BPOWER places. + * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, + * implementation-defined for negative left argument. Here, + * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) + */ + const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER; -#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 2; i++) - __loop__( - invariant(i <= MLKEM_N / 2) - invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT))) - { - const uint8_t t0 = a[3 * i + 0]; - const uint8_t t1 = a[3 * i + 1]; - const uint8_t t2 = a[3 * i + 2]; - r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF); - r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4); - } + /* + * t is in -10 .. 
+10, so we need 32-bit math to + * evaluate t * MLKEM_Q and the subsequent subtraction + */ + int16_t res = (int16_t)(a - t * MLKEM_Q); - /* Note that the coefficients are not canonical */ - debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT); -} -#else /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -{ - poly_frombytes_native(r, a); + debug_assert_abs_bound(&res, 1, HALF_Q); + return res; } -#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ - -MLKEM_NATIVE_INTERNAL_API -void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +#endif /* !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || \ + !defined(MLKEM_USE_NATIVE_INTT) */ + +static void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], + int16_t b_cached) +__contract__( + requires(memory_no_alias(r, 2 * sizeof(int16_t))) + requires(memory_no_alias(a, 2 * sizeof(int16_t))) + requires(memory_no_alias(b, 2 * sizeof(int16_t))) + requires(array_bound(a, 0, 2, 0, UINT12_LIMIT)) + assigns(memory_slice(r, 2 * sizeof(int16_t))) + ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))) { - unsigned i; -#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8) -#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!" 
-#endif + int32_t t0, t1; + debug_assert_bound(a, 2, 0, UINT12_LIMIT); - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) - { - unsigned j; - for (j = 0; j < 8; j++) - __loop__( - invariant(i < MLKEM_N / 8 && j <= 8) - invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) - { - /* Prevent the compiler from recognizing this as a bit selection */ - uint8_t mask = value_barrier_u8(1u << j); - r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask); - } - } - debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); -} + t0 = (int32_t)a[1] * b_cached; + t0 += (int32_t)a[0] * b[0]; + t1 = (int32_t)a[0] * b[1]; + t1 += (int32_t)a[1] * b[0]; -MLKEM_NATIVE_INTERNAL_API -void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a) -{ - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + /* |ti| < 2 * q * 2^15 */ + r[0] = montgomery_reduce(t0); + r[1] = montgomery_reduce(t1); - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - msg[i] = 0; - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8)) - { - uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]); - msg[i] |= t << j; - } - } + debug_assert_abs_bound(r, 2, 2 * MLKEM_Q); } MLKEM_NATIVE_INTERNAL_API @@ -434,12 +292,46 @@ void poly_tomont(poly *r) MLKEM_NATIVE_INTERNAL_API void poly_tomont(poly *r) { - poly_tomont_native(r); + poly_tomont_native(r->coeffs); debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); } #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ #if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) +/************************************************************ + * Name: scalar_signed_to_unsigned_q + * + * Description: converts signed polynomial coefficient + * from signed (-3328 .. 3328) form to + * unsigned form (0 .. 3328). 
+ * + * Note: Cryptographic constant time implementation + * + * Examples: 0 -> 0 + * 1 -> 1 + * 3328 -> 3328 + * -1 -> 3328 + * -2 -> 3327 + * -3328 -> 1 + * + * Arguments: c: signed coefficient to be converted + ************************************************************/ +static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c) +__contract__( + requires(c > -MLKEM_Q && c < MLKEM_Q) + ensures(return_value >= 0 && return_value < MLKEM_Q) + ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q))) +{ + debug_assert_abs_bound(&c, 1, MLKEM_Q); + + /* Add Q if c is negative, but in constant time */ + c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c)); + + /* and therefore cast to uint16_t is safe. */ + debug_assert_bound(&c, 1, 0, MLKEM_Q); + return (uint16_t)c; +} + MLKEM_NATIVE_INTERNAL_API void poly_reduce(poly *r) { @@ -461,7 +353,7 @@ void poly_reduce(poly *r) MLKEM_NATIVE_INTERNAL_API void poly_reduce(poly *r) { - poly_reduce_native(r); + poly_reduce_native(r->coeffs); debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); } #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ @@ -520,13 +412,232 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a) MLKEM_NATIVE_INTERNAL_API void poly_mulcache_compute(poly_mulcache *x, const poly *a) { - poly_mulcache_compute_native(x, a); + poly_mulcache_compute_native(x->coeffs, a->coeffs); /* Omitting bounds assertion since native implementations may * decide not to use a mulcache. Note that the C backend implementation * of poly_basemul_montgomery_cached() does still include the check. */ } #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ +#if !defined(MLKEM_USE_NATIVE_NTT) +/* + * Computes a block CT butterflies with a fixed twiddle factor, + * using Montgomery multiplication. + * Parameters: + * - r: Pointer to base of polynomial (_not_ the base of butterfly block) + * - root: Twiddle factor to use for the butterfly. This must be in + * Montgomery form and signed canonical. 
+ * - start: Offset to the beginning of the butterfly block + * - len: Index difference between coefficients subject to a butterfly + * - bound: Ghost variable describing coefficient bound: Prior to `start`, + * coefficients must be bound by `bound + MLKEM_Q`. Post `start`, + * they must be bound by `bound`. + * When this function returns, output coefficients in the index range + * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`. + * Example: + * - start=8, len=4 + * This would compute the following four butterflies + * 8 -- 12 + * 9 -- 13 + * 10 -- 14 + * 11 -- 15 + * - start=4, len=2 + * This would compute the following two butterflies + * 4 -- 6 + * 5 -- 7 + */ +static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, + unsigned start, unsigned len, int bound) +__contract__( + requires(start < MLKEM_N) + requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N) + requires(0 <= bound && bound < INT16_MAX - MLKEM_Q) + requires(-HALF_Q < zeta && zeta < HALF_Q) + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(array_abs_bound(r, 0, start, bound + MLKEM_Q)) + requires(array_abs_bound(r, start, MLKEM_N, bound)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q)) + ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound))) +{ + /* `bound` is a ghost variable only needed in the CBMC specification */ + unsigned j; + ((void)bound); + for (j = start; j < start + len; j++) + __loop__( + invariant(start <= j && j <= start + len) + /* + * Coefficients are updated in strided pairs, so the bounds for the + * intermediate states alternate twice between the old and new bound + */ + invariant(array_abs_bound(r, 0, j, bound + MLKEM_Q)) + invariant(array_abs_bound(r, j, start + len, bound)) + invariant(array_abs_bound(r, start + len, j + len, bound + MLKEM_Q)) + invariant(array_abs_bound(r, j + len, MLKEM_N, bound))) + { + int16_t t; + t = fqmul(r[j + len], 
zeta); + r[j + len] = r[j] - t; + r[j] = r[j] + t; + } +} + +/* + * Compute one layer of forward NTT + * Parameters: + * - r: Pointer to base of polynomial + * - len: Stride of butterflies in this layer. + * - layer: Ghost variable indicating which layer is being applied. + * Must match `len` via `len == MLKEM_N >> layer`. + * Note: `len` could be dropped and computed in the function, but + * we are following the structure of the reference NTT from the + * official Kyber implementation here, merely adding `layer` as + * a ghost variable for the specifications. + */ +static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer) +__contract__( + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer)) + requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q))) +{ + unsigned start, k; + /* `layer` is a ghost variable only needed in the CBMC specification */ + ((void)layer); + /* Twiddle factors for layer n start at index 2^(layer-1) */ + k = MLKEM_N / (2 * len); + for (start = 0; start < MLKEM_N; start += 2 * len) + __loop__( + invariant(start < MLKEM_N + 2 * len) + invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N) + invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q)) + invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q))) + { + int16_t zeta = zetas[k++]; + ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q); + } +} + +/* + * Compute full forward NTT + * NOTE: This particular implementation satisfies a much tighter + * bound on the output coefficients (5*q) than the contractual one (8*q), + * but this is not needed in the calling code. Should we change the + * base multiplication strategy to require smaller NTT output bounds, + * the proof may need strengthening. 
+ */ + +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *p) +{ + unsigned len, layer; + int16_t *r; + debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); + r = p->coeffs; + + for (len = 128, layer = 1; len >= 2; len >>= 1, layer++) + __loop__( + invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer)) + invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))) + { + ntt_layer(r, len, layer); + } + + /* Check the stronger bound */ + debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); +} +#else /* MLKEM_USE_NATIVE_NTT */ + +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *p) +{ + debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); + ntt_native(p->coeffs); + debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); +} +#endif /* MLKEM_USE_NATIVE_NTT */ + +#if !defined(MLKEM_USE_NATIVE_INTT) + +/* Compute one layer of inverse NTT */ +static void invntt_layer(int16_t *r, unsigned len, unsigned layer) +__contract__( + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7) + requires(len == (1 << (8 - layer))) + requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) +{ + unsigned start, k; + /* `layer` is a ghost variable used only in the specification */ + ((void)layer); + k = MLKEM_N / len - 1; + for (start = 0; start < MLKEM_N; start += 2 * len) + __loop__( + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) + invariant(start <= MLKEM_N && k <= 127) + /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */ + invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len)) + { + unsigned j; + int16_t zeta = zetas[k--]; + for (j = start; j < start + len; j++) + __loop__( + invariant(start <= j && j <= start + len) + invariant(start <= MLKEM_N && k <= 127) + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) + { + int16_t t = r[j]; + r[j] = barrett_reduce(t + r[j + len]); + r[j + len] = r[j + len] - t; + r[j + len] = fqmul(r[j 
+ len], zeta); + } + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *p) +{ + /* + * Scale input polynomial to account for Montgomery factor + * and NTT twist. This also brings coefficients down to + * absolute value < MLKEM_Q. + */ + unsigned j, len, layer; + const int16_t f = 1441; + int16_t *r = p->coeffs; + + for (j = 0; j < MLKEM_N; j++) + __loop__( + invariant(j <= MLKEM_N) + invariant(array_abs_bound(r, 0, j, MLKEM_Q))) + { + r[j] = fqmul(r[j], f); + } + + /* Run the invNTT layers */ + for (len = 2, layer = 7; len <= 128; len <<= 1, layer--) + __loop__( + invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer))) + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) + { + invntt_layer(p->coeffs, len, layer); + } + + debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); +} +#else /* MLKEM_USE_NATIVE_INTT */ + +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *p) +{ + intt_native(p->coeffs); + debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); +} +#endif /* MLKEM_USE_NATIVE_INTT */ + #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ #define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.h index 6a14c785d..cb0d67c1a 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.h @@ -9,7 +9,7 @@ #include #include "cbmc.h" #include "common.h" -#include "reduce.h" +#include "debug.h" #include "verify.h" /* Absolute exclusive upper bound for the output of the inverse NTT */ @@ -18,6 +18,9 @@ /* Absolute exclusive upper bound for the output of the forward NTT */ #define NTT_BOUND (8 * MLKEM_Q) +#define zetas MLKEM_NAMESPACE(zetas) +extern const int16_t zetas[128]; + /* * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] @@ -38,520 +41,6 @@ typedef struct int16_t coeffs[MLKEM_N >> 1]; } poly_mulcache; -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1) -#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4) -#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5) -#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10) -#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11) -#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4) -#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5) -#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10) -#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11) -#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q) -/* End of static namespacing */ - -/************************************************************ - * Name: scalar_compress_d1 - * - * Description: Computes round(u * 2 / q) - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 1. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. 
- */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d1(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 2) - ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) ) -{ - uint32_t d0 = u << 1; - d0 *= 645083; - d0 += 1u << 30; - d0 >>= 31; - return d0; -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_compress_d4 - * - * Description: Computes round(u * 16 / q) % 16 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 4. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d4(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 16) - ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16)) -{ - uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */ - return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */ -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d4 - * - * Description: Computes round(u * q / 16) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 4. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d4(uint32_t u) -__contract__( - requires(0 <= u && u < 16) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 8) / 16; } - -/************************************************************ - * Name: scalar_compress_d5 - * - * Description: Computes round(u * 32 / q) % 32 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 5. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d5(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 32) - ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) ) -{ - uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */ - return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */ -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d5 - * - * Description: Computes round(u * q / 32) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 5. - * - * Arguments: - u: Unsigned canonical modulus modulo 32 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d5(uint32_t u) -__contract__( - requires(0 <= u && u < 32) - ensures(return_value <= MLKEM_Q - 1) -) { return ((u * MLKEM_Q) + 16) / 32; } - -/************************************************************ - * Name: scalar_compress_d10 - * - * Description: Computes round(u * 2**10 / q) % 2**10 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d10(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < (1u << 10)) - ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10))) -{ - uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^32 / MLKEM_Q) */ - d0 = (d0 + ((uint64_t)1u << 32)) >> 33; - return (d0 & 0x3FF); -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d10 - * - * Description: Computes round(u * q / 1024) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d10(uint32_t u) -__contract__( - requires(0 <= u && u < 1024) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 512) / 1024; } - -/************************************************************ - * Name: scalar_compress_d11 - * - * Description: Computes round(u * 2**11 / q) % 2**11 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 11. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d11(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < (1u << 11)) - ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11))) -{ - uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */ - d0 = (d0 + ((uint64_t)1u << 32)) >> 33; - return (d0 & 0x7FF); -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d11 - * - * Description: Computes round(u * q / 1024) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d11(uint32_t u) -__contract__( - requires(0 <= u && u < 2048) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 1024) / 2048; } - -/************************************************************ - * Name: scalar_signed_to_unsigned_q - * - * Description: converts signed polynomial coefficient - * from signed (-3328 .. 3328) form to - * unsigned form (0 .. 3328). - * - * Note: Cryptographic constant time implementation - * - * Examples: 0 -> 0 - * 1 -> 1 - * 3328 -> 3328 - * -1 -> 3328 - * -2 -> 3327 - * -3328 -> 1 - * - * Arguments: c: signed coefficient to be converted - ************************************************************/ -static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c) -__contract__( - requires(c > -MLKEM_Q && c < MLKEM_Q) - ensures(return_value >= 0 && return_value < MLKEM_Q) - ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q))) -{ - debug_assert_abs_bound(&c, 1, MLKEM_Q); - - /* Add Q if c is negative, but in constant time */ - c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c)); - - /* and therefore cast to uint16_t is safe. */ - debug_assert_bound(&c, 1, 0, MLKEM_Q); - return (uint16_t)c; -} - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \ - (MLKEM_K == 2 || MLKEM_K == 3) -#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4) -/************************************************* - * Name: poly_compress_d4 - * - * Description: Compression (4 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a); - -#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10) -/************************************************* - * Name: poly_compress_d10 - * - * Description: Compression (10 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a); - -#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4) -/************************************************* - * Name: poly_decompress_d4 - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]); - -#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10) -/************************************************* - * Name: poly_decompress_d10 - * - * Description: De-serialization and subsequent decompression (10 bits) of a - * polynomial; approximate inverse of poly_compress_d10 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d10(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]); -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ - || MLKEM_K == 3) */ - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 -#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5) -/************************************************* - * Name: poly_compress_d5 - * - * Description: Compression (5 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a); - -#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11) -/************************************************* - * Name: poly_compress_d11 - * - * Description: Compression (11 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a); - -#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5) -/************************************************* - * Name: poly_decompress_d5 - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]); - -#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11) -/************************************************* - * Name: poly_decompress_d11 - * - * Description: De-serialization and subsequent decompression (11 bits) of a - * polynomial; approximate inverse of poly_compress_d11 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d11(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]); -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \ - */ - -#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes) -/************************************************* - * Name: poly_tobytes - * - * Description: Serialization of a polynomial. - * Signed coefficients are converted to - * unsigned form before serialization. 
- * - * Arguments: INPUT: - * - a: const pointer to input polynomial, - * with each coefficient in the range [0,1,..,Q-1] - * OUTPUT - * - r: pointer to output byte array - * (of MLKEM_POLYBYTES bytes) - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYBYTES)) - requires(memory_no_alias(a, sizeof(poly))) - requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(r)) -); - - -#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes) -/************************************************* - * Name: poly_frombytes - * - * Description: De-serialization of a polynomial. - * - * Arguments: INPUT - * - a: pointer to input byte array - * (of MLKEM_POLYBYTES bytes) - * OUTPUT - * - r: pointer to output polynomial, with - * each coefficient unsigned and in the range - * 0 .. 4095 - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)) -); - - -#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg) -/************************************************* - * Name: poly_frommsg - * - * Description: Convert 32-byte message to polynomial - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *msg: pointer to input message - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) -__contract__( - requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(object_whole(r)) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) -); - -#define poly_tomsg 
MLKEM_NAMESPACE(poly_tomsg) -/************************************************* - * Name: poly_tomsg - * - * Description: Convert polynomial to 32-byte message - * - * Arguments: - uint8_t *msg: pointer to output message - * - const poly *r: pointer to input polynomial - * Coefficients must be unsigned canonical - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r) -__contract__( - requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(msg)) -); - #define poly_basemul_montgomery_cached \ MLKEM_NAMESPACE(poly_basemul_montgomery_cached) /************************************************* @@ -715,4 +204,56 @@ __contract__( assigns(object_whole(r)) ); +#define poly_ntt MLKEM_NAMESPACE(poly_ntt) +/************************************************* + * Name: poly_ntt + * + * Description: Computes negacyclic number-theoretic transform (NTT) of + * a polynomial in place. + * + * The input is assumed to be in normal order and + * coefficient-wise bound by MLKEM_Q in absolute value. + * + * The output polynomial is in bitreversed order, and + * coefficient-wise bound by NTT_BOUND in absolute value. + * + * (NOTE: Sometimes the input to the NTT is actually smaller, + * which gives better bounds.) 
+ * + * Arguments: - poly *r: pointer to in/output polynomial + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *r) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q)) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND)) +); + +#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont) +/************************************************* + * Name: poly_invntt_tomont + * + * Description: Computes inverse of negacyclic number-theoretic transform (NTT) + * of a polynomial in place; + * inputs assumed to be in bitreversed order, output in normal + * order + * + * The input is assumed to be in bitreversed order, and can + * have arbitrary coefficients in int16_t. + * + * The output polynomial is in normal order, and + * coefficient-wise bound by INVNTT_BOUND in absolute value. + * + * Arguments: - poly *r: pointer to in/output polynomial + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *r) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND)) +); + #endif /* POLY_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly_k.c similarity index 97% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/polyvec.c rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly_k.c index 50ea1c34a..c2d330ea9 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/polyvec.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly_k.c @@ -2,13 +2,12 @@ * Copyright (c) 2024 The mlkem-native project authors * SPDX-License-Identifier: Apache-2.0 */ -#include "polyvec.h" +#include "poly_k.h" #include #include #include "arith_backend.h" -#include "cbd.h" 
-#include "ntt.h" -#include "poly.h" +#include "compress.h" +#include "sampling.h" #include "symmetric.h" #include "debug.h" @@ -131,7 +130,9 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, /* Omitting bounds assertion for cache since native implementations may * decide not to use a mulcache. Note that the C backend implementation * of poly_basemul_montgomery_cached() does still include the check. */ - polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache); + polyvec_basemul_acc_montgomery_cached_native(r->coeffs, (const int16_t *)a, + (const int16_t *)b, + (const int16_t *)b_cache); } #endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly_k.h similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.h rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly_k.h index 8be8579e0..0aea95912 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly_k.h @@ -2,11 +2,12 @@ * Copyright (c) 2024 The mlkem-native project authors * SPDX-License-Identifier: Apache-2.0 */ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef POLY_K_H +#define POLY_K_H #include #include "common.h" +#include "compress.h" #include "poly.h" #define polyvec MLKEM_NAMESPACE_K(polyvec) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/reduce.h deleted file mode 100644 index b432a4201..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/reduce.h +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef REDUCE_H -#define REDUCE_H - -#include -#include "cbmc.h" -#include "common.h" -#include "debug.h" - -/* Static namespacing - * This is to facilitate building multiple instances 
- * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16) -#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic) -#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce) -#define fqmul MLKEM_NAMESPACE(fqmul) -#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce) -/* End of static namespacing */ - -#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */ - -/************************************************* - * Name: cast_uint16_to_int16 - * - * Description: Cast uint16 value to int16 - * - * Returns: - * input x in 0 .. 32767: returns value unchanged - * input x in 32768 .. 65535: returns (x - 65536) - **************************************************/ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "conversion" -#endif -ALWAYS_INLINE -static INLINE int16_t cast_uint16_to_int16(uint16_t x) -{ - /* - * PORTABILITY: This relies on uint16_t -> int16_t - * being implemented as the inverse of int16_t -> uint16_t, - * which is implementation-defined (C99 6.3.1.3 (3)) - * CBMC (correctly) fails to prove this conversion is OK, - * so we have to suppress that check here - */ - return (int16_t)x; -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************* - * Name: montgomery_reduce_generic - * - * Description: Generic Montgomery reduction; given a 32-bit integer a, computes - * 16-bit integer congruent to a * R^-1 mod q, where R=2^16 - * - * Arguments: - int32_t a: input integer to be reduced - * - * Returns: integer congruent to a * R^-1 modulo q, with absolute value - * <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2 - * - **************************************************/ -ALWAYS_INLINE -static INLINE int16_t montgomery_reduce_generic(int32_t a) -{ - /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */ - const uint32_t QINV = 62209; /* q^-1 mod 2^16 */ - - /* Compute 
a*q^{-1} mod 2^16 in unsigned representatives */ - const uint16_t a_reduced = a & UINT16_MAX; - const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX; - - /* Lift to signed canonical representative mod 2^16. */ - const int16_t t = cast_uint16_to_int16(a_inverted); - - int32_t r = a - ((int32_t)t * MLKEM_Q); - /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */ - - /* - * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, - * implementation-defined for negative left argument. Here, - * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) - */ - r = r >> 16; - /* Bounds: |r >> 16| <= ceil(|r| / 2^16) - * <= ceil(|a| / 2^16 + MLKEM_Q / 2) - * <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 - * - * (Note that |a >> n| = ceil(|a| / 2^16) for negative a) - */ - - return (int16_t)r; -} - -/************************************************* - * Name: montgomery_reduce - * - * Description: Montgomery reduction - * - * Arguments: - int32_t a: input integer to be reduced - * Must be smaller than 2 * 2^12 * 2^15 in absolute value. - * - * Returns: integer congruent to a * R^-1 modulo q, - * smaller than 2 * q in absolute value. 
- **************************************************/ -static INLINE int16_t montgomery_reduce(int32_t a) -__contract__( - requires(a > -(2 * UINT12_LIMIT * 32768)) - requires(a < (2 * UINT12_LIMIT * 32768)) - ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q) -) -{ - int16_t res; - debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768); - - res = montgomery_reduce_generic(a); - /* Bounds: - * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2 - * <= UINT12_LIMIT + (MLKEM_Q + 1) / 2 - * < 2 * MLKEM_Q */ - - debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q); - return res; -} - -/************************************************* - * Name: fqmul - * - * Description: Montgomery multiplication modulo q=3329 - * - * Arguments: - int16_t a: first factor - * Can be any int16_t. - * - int16_t b: second factor. - * Must be signed canonical (abs value <(q+1)/2) - * - * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and - * smaller than q in absolute value. 
- * - **************************************************/ -static INLINE int16_t fqmul(int16_t a, int16_t b) -__contract__( - requires(b > -HALF_Q) - requires(b < HALF_Q) - ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q) -) -{ - int16_t res; - debug_assert_abs_bound(&b, 1, HALF_Q); - - res = montgomery_reduce((int32_t)a * (int32_t)b); - /* Bounds: - * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2 - * < MLKEM_Q - */ - - debug_assert_abs_bound(&res, 1, MLKEM_Q); - return res; -} - -/************************************************* - * Name: barrett_reduce - * - * Description: Barrett reduction; given a 16-bit integer a, computes - * centered representative congruent to a mod q in - * {-(q-1)/2,...,(q-1)/2} - * - * Arguments: - int16_t a: input integer to be reduced - * - * Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. - **************************************************/ -static INLINE int16_t barrett_reduce(int16_t a) -__contract__( - ensures(return_value > -HALF_Q && return_value < HALF_Q) -) -{ - /* - * To divide by MLKEM_Q using Barrett multiplication, the "magic number" - * multiplier is round_to_nearest(2**26/MLKEM_Q) - */ - const int BPOWER = 26; - const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q; - - /* - * Compute round_to_nearest(a/MLKEM_Q) using the multiplier - * above and shift by BPOWER places. - * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, - * implementation-defined for negative left argument. Here, - * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) - */ - const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER; - - /* - * t is in -10 .. 
+10, so we need 32-bit math to - * evaluate t * MLKEM_Q and the subsequent subtraction - */ - int16_t res = (int16_t)(a - t * MLKEM_Q); - - debug_assert_abs_bound(&res, 1, HALF_Q); - return res; -} - -#endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/sampling.c similarity index 73% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/rej_uniform.c rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/sampling.c index cbbe4407f..98cbdcb74 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/rej_uniform.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/sampling.c @@ -9,7 +9,7 @@ #include "debug.h" #include "fips202.h" #include "fips202x4.h" -#include "rej_uniform.h" +#include "sampling.h" #include "symmetric.h" /* Static namespacing @@ -18,6 +18,8 @@ * within a single compilation unit. */ #define rej_uniform MLKEM_NAMESPACE(rej_uniform) #define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar) +#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) +#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) /* End of static namespacing */ static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target, @@ -233,9 +235,113 @@ void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) xof_release(&state); } +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) +#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) +/* End of static namespacing */ + +/************************************************* + * Name: load32_littleendian + * + * Description: load 4 bytes into a 32-bit integer + * in little-endian order + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns 32-bit unsigned integer loaded from x + **************************************************/ +static uint32_t load32_littleendian(const uint8_t x[4]) +{ + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + r |= (uint32_t)x[3] << 24; + return r; +} + +MLKEM_NATIVE_INTERNAL_API +void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3))) + { + unsigned j; + uint32_t t = load32_littleendian(buf + 4 * i); + uint32_t d = t & 0x55555555; + d += (t >> 1) & 0x55555555; + + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3))) + { + const int16_t a = (d >> (4 * j + 0)) & 0x3; + const int16_t b = (d >> (4 * j + 2)) & 0x3; + r->coeffs[8 * i + j] = a - b; + } + } +} + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 +/************************************************* + * Name: load24_littleendian + * + * Description: load 3 bytes into a 32-bit integer + * in little-endian order. 
+ * This function is only needed for ML-KEM-512 + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) + **************************************************/ +static uint32_t load24_littleendian(const uint8_t x[3]) +{ + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + return r; +} + +MLKEM_NATIVE_INTERNAL_API +void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 4; i++) + __loop__( + invariant(i <= MLKEM_N / 4) + invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4))) + { + unsigned j; + const uint32_t t = load24_littleendian(buf + 3 * i); + uint32_t d = t & 0x00249249; + d += (t >> 1) & 0x00249249; + d += (t >> 2) & 0x00249249; + + for (j = 0; j < 4; j++) + __loop__( + invariant(i <= MLKEM_N / 4 && j <= 4) + invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4))) + { + const int16_t a = (d >> (6 * j + 0)) & 0x7; + const int16_t b = (d >> (6 * j + 3)) & 0x7; + r->coeffs[4 * i + j] = a - b; + } + } +} +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \ + 3 */ + #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ -#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform) -int empty_cu_rej_uniform; +#define empty_cu_sampling MLKEM_NAMESPACE_K(empty_cu_sampling) +int empty_cu_sampling; #endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/sampling.h similarity index 63% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.h rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/sampling.h index 801287259..cc524e0fc 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/sampling.h @@ -2,8 +2,8 @@ * Copyright (c) 2024 The mlkem-native project 
authors * SPDX-License-Identifier: Apache-2.0 */ -#ifndef REJ_UNIFORM_H -#define REJ_UNIFORM_H +#ifndef SAMPLING_H +#define SAMPLING_H #include #include @@ -11,6 +11,37 @@ #include "common.h" #include "poly.h" +#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2) +/************************************************* + * Name: poly_cbd2 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter eta=2 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]); + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 +#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3) +/************************************************* + * Name: poly_cbd3 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter eta=3. 
+ * This function is only needed for ML-KEM-512 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]); +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 */ + #define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4) /************************************************* * Name: poly_rej_uniform_x4 @@ -60,4 +91,4 @@ __contract__( assigns(memory_slice(entry, sizeof(poly))) ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q))); -#endif /* REJ_UNIFORM_H */ +#endif /* SAMPLING_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/zetas.c index 4ef887c62..987f0dce4 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/zetas.c @@ -10,7 +10,7 @@ #include "common.h" #if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) -#include "ntt.h" +#include "poly.h" /* * Table of zeta values used in the reference NTT and inverse NTT. diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/arith_backend.h index 0543b1bd1..ade31cda1 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/arith_backend.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/arith_backend.h @@ -17,7 +17,7 @@ * Keep this _after_ the inclusion of the backend; otherwise, * the sanity checks won't have an effect.
*/ #if defined(MLKEM_NATIVE_CHECK_APIS) -#include "api.h" +#include "native/api.h" #endif #endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.c deleted file mode 100644 index 1e6b7c5d1..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED - -#include -#include "cbd.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) -#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) -/* End of static namespacing */ - -/************************************************* - * Name: load32_littleendian - * - * Description: load 4 bytes into a 32-bit integer - * in little-endian order - * - * Arguments: - const uint8_t *x: pointer to input byte array - * - * Returns 32-bit unsigned integer loaded from x - **************************************************/ -static uint32_t load32_littleendian(const uint8_t x[4]) -{ - uint32_t r; - r = (uint32_t)x[0]; - r |= (uint32_t)x[1] << 8; - r |= (uint32_t)x[2] << 16; - r |= (uint32_t)x[3] << 24; - return r; -} - -MLKEM_NATIVE_INTERNAL_API -void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3))) - { - unsigned j; - uint32_t t = load32_littleendian(buf + 4 * i); - uint32_t d = t & 0x55555555; - d += (t >> 1) & 0x55555555; - - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3))) - { - const int16_t a = (d >> (4 * j + 0)) & 0x3; - const 
int16_t b = (d >> (4 * j + 2)) & 0x3; - r->coeffs[8 * i + j] = a - b; - } - } -} - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 -/************************************************* - * Name: load24_littleendian - * - * Description: load 3 bytes into a 32-bit integer - * in little-endian order. - * This function is only needed for ML-KEM-512 - * - * Arguments: - const uint8_t *x: pointer to input byte array - * - * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) - **************************************************/ -static uint32_t load24_littleendian(const uint8_t x[3]) -{ - uint32_t r; - r = (uint32_t)x[0]; - r |= (uint32_t)x[1] << 8; - r |= (uint32_t)x[2] << 16; - return r; -} - -MLKEM_NATIVE_INTERNAL_API -void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 4; i++) - __loop__( - invariant(i <= MLKEM_N / 4) - invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4))) - { - unsigned j; - const uint32_t t = load24_littleendian(buf + 3 * i); - uint32_t d = t & 0x00249249; - d += (t >> 1) & 0x00249249; - d += (t >> 2) & 0x00249249; - - for (j = 0; j < 4; j++) - __loop__( - invariant(i <= MLKEM_N / 4 && j <= 4) - invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4))) - { - const int16_t a = (d >> (6 * j + 0)) & 0x7; - const int16_t b = (d >> (6 * j + 3)) & 0x7; - r->coeffs[4 * i + j] = a - b; - } - } -} -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \ - 3 */ - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd) -int empty_cu_cbd; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.h deleted file mode 100644 index 54c1f5b90..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native 
project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef CBD_H -#define CBD_H - -#include -#include "common.h" -#include "poly.h" - -#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2) -/************************************************* - * Name: poly_cbd2 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter eta=2 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]); - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 -#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3) -/************************************************* - * Name: poly_cbd3 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter eta=3. 
- * This function is only needed for ML-KEM-512 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]); -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */ - -#endif /* CBD_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/common.h index 4f326333e..62ed53ab1 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/common.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/common.h @@ -15,12 +15,19 @@ #include "sys.h" /* Include backend metadata */ -#if defined(MLKEM_USE_NATIVE) -#if defined(MLKEM_NATIVE_ARITH_BACKEND) -#include MLKEM_NATIVE_ARITH_BACKEND +#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) +#if defined(MLKEM_NATIVE_ARITH_BACKEND_FILE) +#include MLKEM_NATIVE_ARITH_BACKEND_FILE +#else +#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_ARITH is set, but MLKEM_NATIVE_ARITH_BACKEND_FILE is not. +#endif #endif -#if defined(MLKEM_NATIVE_FIPS202_BACKEND) -#include MLKEM_NATIVE_FIPS202_BACKEND + +#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) +#if defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE) +#include MLKEM_NATIVE_FIPS202_BACKEND_FILE +#else +#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, but MLKEM_NATIVE_FIPS202_BACKEND_FILE is not. 
#endif #endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/compress.c new file mode 100644 index 000000000..a03fe0ac4 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/compress.c @@ -0,0 +1,395 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include "common.h" +#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) + +#include +#include +#include "arith_backend.h" +#include "cbmc.h" +#include "compress.h" +#include "debug.h" +#include "verify.h" + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3) +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + uint8_t t[8] = {0}; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_bound(t, 0, j, 0, 16))) + { + t[j] = scalar_compress_d4(a->coeffs[8 * i + j]); + } + + r[i * 4] = t[0] | (t[1] << 4); + r[i * 4 + 1] = t[2] | (t[3] << 4); + r[i * 4 + 2] = t[4] | (t[5] << 4); + r[i * 4 + 3] = t[6] | (t[7] << 4); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a) +{ + unsigned j; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + for (j = 0; j < MLKEM_N / 4; j++) + __loop__(invariant(j <= MLKEM_N / 4)) + { + unsigned k; + uint16_t t[4]; + for (k = 0; k < 4; k++) + __loop__( + invariant(k <= 4) + invariant(forall(r, 0, k, t[r] < (1u << 10)))) + { + t[k] = scalar_compress_d10(a->coeffs[4 * j + k]); + } + + /* + * Make all implicit truncation explicit. No data is being + * truncated for the LHS's since each t[i] is 10-bit in size. 
+ */ + r[5 * j + 0] = (t[0] >> 0) & 0xFF; + r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF); + r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF); + r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF); + r[5 * j + 4] = (t[3] >> 2); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 2; i++) + __loop__( + invariant(i <= MLKEM_N / 2) + invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q))) + { + r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF); + r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF); + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d10(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]) +{ + unsigned j; + for (j = 0; j < MLKEM_N / 4; j++) + __loop__( + invariant(j <= MLKEM_N / 4) + invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q))) + { + unsigned k; + uint16_t t[4]; + uint8_t const *base = &a[5 * j]; + + t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); + t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6)); + t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4)); + t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2)); + + for (k = 0; k < 4; k++) + __loop__( + invariant(k <= 4) + invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q))) + { + r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ + || MLKEM_K == 3) */ + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + uint8_t t[8] = {0}; + for (j = 0; 
j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_bound(t, 0, j, 0, 32))) + { + t[j] = scalar_compress_d5(a->coeffs[8 * i + j]); + } + + /* + * Explicitly truncate to avoid warning about + * implicit truncation in CBMC, and use array indexing into + * r rather than pointer-arithmetic to simplify verification + */ + r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5)); + r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); + r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4)); + r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); + r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3)); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a) +{ + unsigned j; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (j = 0; j < MLKEM_N / 8; j++) + __loop__(invariant(j <= MLKEM_N / 8)) + { + unsigned k; + uint16_t t[8]; + for (k = 0; k < 8; k++) + __loop__( + invariant(k <= 8) + invariant(forall(r, 0, k, t[r] < (1u << 11)))) + { + t[k] = scalar_compress_d11(a->coeffs[8 * j + k]); + } + + /* + * Make all implicit truncation explicit. No data is being + * truncated for the LHS's since each t[i] is 11-bit in size. 
+ */ + r[11 * j + 0] = (t[0] >> 0) & 0xFF; + r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF); + r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF); + r[11 * j + 3] = (t[2] >> 2) & 0xFF; + r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF); + r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF); + r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF); + r[11 * j + 7] = (t[5] >> 1) & 0xFF; + r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF); + r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF); + r[11 * j + 10] = (t[7] >> 3); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) + { + unsigned j; + uint8_t t[8]; + const unsigned offset = i * 5; + /* + * Explicitly truncate to avoid warning about + * implicit truncation in CBMC and unwind loop for ease + * of proof. + */ + + /* + * Decompress 5 8-bit bytes (so 40 bits) into + * 8 5-bit values stored in t[] + */ + t[0] = 0x1F & (a[offset + 0] >> 0); + t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3)); + t[2] = 0x1F & (a[offset + 1] >> 2); + t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1)); + t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4)); + t[5] = 0x1F & (a[offset + 3] >> 1); + t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2)); + t[7] = 0x1F & (a[offset + 4] >> 3); + + /* and copy to the correct slice in r[] */ + for (j = 0; j < 8; j++) + __loop__( + invariant(j <= 8 && i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) + { + r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d11(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]) +{ + unsigned j; + for (j = 0; j < MLKEM_N / 8; j++) + __loop__( + invariant(j <= MLKEM_N 
/ 8) + invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q))) + { + unsigned k; + uint16_t t[8]; + uint8_t const *base = &a[11 * j]; + t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); + t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5)); + t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) | + ((uint16_t)base[4] << 10)); + t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7)); + t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4)); + t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) | + ((uint16_t)base[8] << 9)); + t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6)); + t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3)); + + for (k = 0; k < 8; k++) + __loop__( + invariant(k <= 8) + invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q))) + { + r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 */ + +#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 2; i++) + __loop__(invariant(i <= MLKEM_N / 2)) + { + const uint16_t t0 = a->coeffs[2 * i]; + const uint16_t t1 = a->coeffs[2 * i + 1]; + /* + * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of + * significant data, so these can be packed into 24 bits or exactly + * 3 bytes, as follows. + */ + + /* Least significant bits 0 - 7 of t0. */ + r[3 * i + 0] = t0 & 0xFF; + + /* + * Most significant bits 8 - 11 of t0 become the least significant + * nibble of the second byte. The least significant 4 bits + * of t1 become the upper nibble of the second byte. + */ + r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0); + + /* Bits 4 - 11 of t1 become the third byte.
*/ + r[3 * i + 2] = t1 >> 4; + } +} +#else /* MLKEM_USE_NATIVE_POLY_TOBYTES */ +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +{ + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + poly_tobytes_native(r, a->coeffs); +} +#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + +#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 2; i++) + __loop__( + invariant(i <= MLKEM_N / 2) + invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT))) + { + const uint8_t t0 = a[3 * i + 0]; + const uint8_t t1 = a[3 * i + 1]; + const uint8_t t2 = a[3 * i + 2]; + r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF); + r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4); + } + + /* Note that the coefficients are not canonical */ + debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT); +} +#else /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +{ + poly_frombytes_native(r->coeffs, a); +} +#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ + +MLKEM_NATIVE_INTERNAL_API +void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +{ + unsigned i; +#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8) +#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!" 
+#endif + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) + { + unsigned j; + for (j = 0; j < 8; j++) + __loop__( + invariant(i < MLKEM_N / 8 && j <= 8) + invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) + { + /* Prevent the compiler from recognizing this as a bit selection */ + uint8_t mask = value_barrier_u8(1u << j); + r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask); + } + } + debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + msg[i] = 0; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8)) + { + uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]); + msg[i] |= t << j; + } + } +} + +#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ + +#define empty_cu_compress MLKEM_NAMESPACE_K(empty_cu_compress) +int empty_cu_compress; + +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/compress.h new file mode 100644 index 000000000..409dbe519 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/compress.h @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef COMPRESS_H +#define COMPRESS_H + +#include +#include +#include "cbmc.h" +#include "common.h" +#include "debug.h" +#include "poly.h" +#include "verify.h" + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1) +#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4) +#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5) +#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10) +#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11) +#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4) +#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5) +#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10) +#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11) +/* End of static namespacing */ + +/************************************************************ + * Name: scalar_compress_d1 + * + * Description: Computes round(u * 2 / q) % 2 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 1. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. + */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d1(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 2) + ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) ) +{ + uint32_t d0 = u << 1; + d0 *= 645083; + d0 += 1u << 30; + d0 >>= 31; + return d0; +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_compress_d4 + * + * Description: Computes round(u * 16 / q) % 16 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 4. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed.
+ ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. + */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d4(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 16) + ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16)) +{ + uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */ + return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */ +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d4 + * + * Description: Computes round(u * q / 16) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 4. + * + * Arguments: - u: Unsigned canonical modulus modulo 16 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d4(uint32_t u) +__contract__( + requires(0 <= u && u < 16) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 8) / 16; } + +/************************************************************ + * Name: scalar_compress_d5 + * + * Description: Computes round(u * 32 / q) % 32 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 5. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. 
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d5(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 32) + ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) ) +{ + uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */ + return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */ +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d5 + * + * Description: Computes round(u * q / 32) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 5. + * + * Arguments: - u: Unsigned canonical modulus modulo 32 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d5(uint32_t u) +__contract__( + requires(0 <= u && u < 32) + ensures(return_value <= MLKEM_Q - 1) +) { return ((u * MLKEM_Q) + 16) / 32; } + +/************************************************************ + * Name: scalar_compress_d10 + * + * Description: Computes round(u * 2**10 / q) % 2**10 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 10. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. 
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d10(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < (1u << 10)) + ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10))) +{ + uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^32 / MLKEM_Q) */ + d0 = (d0 + ((uint64_t)1u << 32)) >> 33; + return (d0 & 0x3FF); +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d10 + * + * Description: Computes round(u * q / 1024) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 10. + * + * Arguments: - u: Unsigned canonical modulus modulo 16 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d10(uint32_t u) +__contract__( + requires(0 <= u && u < 1024) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 512) / 1024; } + +/************************************************************ + * Name: scalar_compress_d11 + * + * Description: Computes round(u * 2**11 / q) % 2**11 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 11. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. 
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d11(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < (1u << 11)) + ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11))) +{ + uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */ + d0 = (d0 + ((uint64_t)1u << 32)) >> 33; + return (d0 & 0x7FF); +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d11 + * + * Description: Computes round(u * q / 1024) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 10. + * + * Arguments: - u: Unsigned canonical modulus modulo 16 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d11(uint32_t u) +__contract__( + requires(0 <= u && u < 2048) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 1024) / 2048; } + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \ + (MLKEM_K == 2 || MLKEM_K == 3) +#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4) +/************************************************* + * Name: poly_compress_d4 + * + * Description: Compression (4 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a); + +#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10) +/************************************************* + * Name: poly_compress_d10 + * + * Description: Compression (10 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a); + +#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4) +/************************************************* + * Name: poly_decompress_d4 + * + * Description: De-serialization and subsequent decompression (4 bits) of a + * polynomial; approximate inverse of poly_compress_d4 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q).
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]); + +#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10) +/************************************************* + * Name: poly_decompress_d10 + * + * Description: De-serialization and subsequent decompression (10 bits) of a + * polynomial; approximate inverse of poly_compress_d10 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d10(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]); +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ + || MLKEM_K == 3) */ + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 +#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5) +/************************************************* + * Name: poly_compress_d5 + * + * Description: Compression (5 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a); + +#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11) +/************************************************* + * Name: poly_compress_d11 + * + * Description: Compression (11 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a); + +#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5) +/************************************************* + * Name: poly_decompress_d5 + * + * Description: De-serialization and subsequent decompression (5 bits) of a + * polynomial; approximate inverse of poly_compress_d5 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q).
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]); + +#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11) +/************************************************* + * Name: poly_decompress_d11 + * + * Description: De-serialization and subsequent decompression (11 bits) of a + * polynomial; approximate inverse of poly_compress_d11 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d11(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]); +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \ + */ + +#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes) +/************************************************* + * Name: poly_tobytes + * + * Description: Serialization of a polynomial. + * Signed coefficients are converted to + * unsigned form before serialization. 
+ * + * Arguments: INPUT: + * - a: const pointer to input polynomial, + * with each coefficient in the range [0,1,..,Q-1] + * OUTPUT + * - r: pointer to output byte array + * (of MLKEM_POLYBYTES bytes) + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYBYTES)) + requires(memory_no_alias(a, sizeof(poly))) + requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(r)) +); + + +#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes) +/************************************************* + * Name: poly_frombytes + * + * Description: De-serialization of a polynomial. + * + * Arguments: INPUT + * - a: pointer to input byte array + * (of MLKEM_POLYBYTES bytes) + * OUTPUT + * - r: pointer to output polynomial, with + * each coefficient unsigned and in the range + * 0 .. 4095 + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)) +); + + +#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg) +/************************************************* + * Name: poly_frommsg + * + * Description: Convert 32-byte message to polynomial + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *msg: pointer to input message + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +__contract__( + requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(object_whole(r)) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) +); + +#define poly_tomsg 
MLKEM_NAMESPACE(poly_tomsg) +/************************************************* + * Name: poly_tomsg + * + * Description: Convert polynomial to 32-byte message + * + * Arguments: - uint8_t *msg: pointer to output message + * - const poly *r: pointer to input polynomial + * Coefficients must be unsigned canonical + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r) +__contract__( + requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(msg)) +); + +#endif /* COMPRESS_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/config.h index fa89370ce..e975ede95 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/config.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/config.h @@ -122,46 +122,87 @@ /* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ /****************************************************************************** - * Name: MLKEM_USE_NATIVE + * Name: MLKEM_USE_NATIVE_BACKEND_ARITH * - * Description: Determines whether a native backend should - * be used, if available. + * Description: Determines whether a native arithmetic backend should be used. + * + * The arithmetic backend covers performance critical functions + * such as the number-theoretic transform (NTT). + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the arithmetic backend to be used is + * determined by MLKEM_NATIVE_ARITH_BACKEND_FILE: If the latter is + * unset, the default backend for the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. * * This can also be set using CFLAGS.
* *****************************************************************************/ -#if !defined(MLKEM_USE_NATIVE) -/* #define MLKEM_USE_NATIVE */ +#if !defined(MLKEM_USE_NATIVE_BACKEND_ARITH) +/* #define MLKEM_USE_NATIVE_BACKEND_ARITH */ #endif /****************************************************************************** - * Name: MLKEM_NATIVE_ARITH_BACKEND + * Name: MLKEM_NATIVE_ARITH_BACKEND_FILE * * Description: The arithmetic backend to use. * - * This must be the filename of an arithmetic backend. - * See the existing backends for examples. + * If MLKEM_USE_NATIVE_BACKEND_ARITH is unset, this option + * is ignored. + * + * If MLKEM_USE_NATIVE_BACKEND_ARITH is set, this option must + * either be undefined or the filename of an arithmetic backend. + * If unset, the default backend will be used. * * This can be set using CFLAGS. * *****************************************************************************/ -#if defined(MLKEM_USE_NATIVE) && !defined(MLKEM_NATIVE_ARITH_BACKEND) -#define MLKEM_NATIVE_ARITH_BACKEND "default.h" -#endif /* MLKEM_NATIVE_ARITH_BACKEND */ +#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) && \ + !defined(MLKEM_NATIVE_ARITH_BACKEND_FILE) +#define MLKEM_NATIVE_ARITH_BACKEND_FILE "native/default.h" +#endif /****************************************************************************** - * Name: MLKEM_NATIVE_FIPS202_BACKEND + * Name: MLKEM_USE_NATIVE_BACKEND_FIPS202 + * + * Description: Determines whether a native FIPS202 backend should be used. + * + * The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is + * the performance bottleneck of SHA3 and SHAKE. + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the FIPS202 backend to be used is + * determined by MLKEM_NATIVE_FIPS202_BACKEND_FILE: If the latter is + * unset, the default backend for the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. + * + * This can also be set using CFLAGS.
+ * + *****************************************************************************/ +#if !defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) +/* #define MLKEM_USE_NATIVE_BACKEND_FIPS202 */ +#endif + +/****************************************************************************** + * Name: MLKEM_NATIVE_FIPS202_BACKEND_FILE * * Description: The FIPS-202 backend to use. * - * This must be the filename of an FIPS-202 backend. + * If MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, this option must + * either be undefined or the filename of a FIPS202 backend. + * If unset, the default backend will be used. * * This can be set using CFLAGS. * *****************************************************************************/ -#if defined(MLKEM_USE_NATIVE_FIPS202) && !defined(MLKEM_NATIVE_FIPS202_BACKEND) -#define MLKEM_NATIVE_FIPS202_BACKEND "native/default.h" -#endif /* MLKEM_NATIVE_FIPS202_BACKEND */ +#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) && \ + !defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE) +#define MLKEM_NATIVE_FIPS202_BACKEND_FILE "fips202/native/default.h" +#endif /************************* Config internals ********************************/ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.c index 0cfcc3e9e..318d0fc77 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.c @@ -9,11 +9,10 @@ #include "fips202.h" #include "fips202x4.h" #include "indcpa.h" -#include "ntt.h" #include "poly.h" -#include "polyvec.h" +#include "poly_k.h" #include "randombytes.h" -#include "rej_uniform.h" +#include "sampling.h" #include "symmetric.h" #include "arith_backend.h" @@ -149,14 +148,14 @@ static void unpack_ciphertext(polyvec *b, poly *v, #define poly_permute_bitrev_to_custom \ MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom) -static INLINE void poly_permute_bitrev_to_custom(poly *data) +static INLINE void poly_permute_bitrev_to_custom(int16_t data[MLKEM_N]) 
__contract__( /* We don't specify that this should be a permutation, but only * that it does not change the bound established at the end of gen_matrix. */ - requires(memory_no_alias(data, sizeof(poly))) - requires(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N)) + requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q)) assigns(memory_slice(data, sizeof(poly))) - ensures(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); } + ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); } #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ /* Not static for benchmarking */ @@ -245,7 +244,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed) { for (j = 0; j < MLKEM_K; j++) { - poly_permute_bitrev_to_custom(&a[i].vec[j]); + poly_permute_bitrev_to_custom(a[i].vec[j].coeffs); } } } diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.h index 2c4fda3c4..b4d5985bf 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.h @@ -8,7 +8,7 @@ #include #include "cbmc.h" #include "common.h" -#include "polyvec.h" +#include "poly_k.h" #define gen_matrix MLKEM_NAMESPACE_K(gen_matrix) /************************************************* diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/native/api.h similarity index 90% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/api.h rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/native/api.h index 792ecb8a4..0704f9dcd 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/api.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/native/api.h @@ -23,8 +23,7 @@ #define MLKEM_NATIVE_ARITH_NATIVE_API_H #include -#include "poly.h" -#include "polyvec.h" +#include "../common.h" /* * This is the C<->native interface allowing for the drop-in of @@ 
-65,9 +64,9 @@ * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER * for more information. * - * Arguments: - poly *p: pointer to in/output polynomial + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial **************************************************/ -static INLINE void ntt_native(poly *); +static INLINE void ntt_native(int16_t p[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_NTT */ #if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER) @@ -96,10 +95,10 @@ and to/from bytes conversions." * * This must only be defined if there is native code for * all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache. - * Arguments: - poly *p: pointer to in/output polynomial + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial * **************************************************/ -static INLINE void poly_permute_bitrev_to_custom(poly *); +static INLINE void poly_permute_bitrev_to_custom(int16_t p[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ #if defined(MLKEM_USE_NATIVE_INTT) @@ -117,7 +116,7 @@ static INLINE void poly_permute_bitrev_to_custom(poly *); * * Arguments: - uint16_t *a: pointer to in/output polynomial **************************************************/ -static INLINE void intt_native(poly *); +static INLINE void intt_native(int16_t p[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_INTT */ #if defined(MLKEM_USE_NATIVE_POLY_REDUCE) @@ -126,9 +125,9 @@ static INLINE void intt_native(poly *); * * Description: Applies modular reduction to all coefficients of a polynomial. 
* - * Arguments: - poly *r: pointer to input/output polynomial + * Arguments: - int16_t r[MLKEM_N]: pointer to input/output polynomial **************************************************/ -static INLINE void poly_reduce_native(poly *); +static INLINE void poly_reduce_native(int16_t p[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ #if defined(MLKEM_USE_NATIVE_POLY_TOMONT) @@ -138,9 +137,9 @@ static INLINE void poly_reduce_native(poly *); * Description: Inplace conversion of all coefficients of a polynomial * from normal domain to Montgomery domain * - * Arguments: - poly *r: pointer to input/output polynomial + * Arguments: - int16_t r[MLKEM_N]: pointer to input/output polynomial **************************************************/ -static INLINE void poly_tomont_native(poly *); +static INLINE void poly_tomont_native(int16_t p[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ #if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) @@ -165,8 +164,8 @@ static INLINE void poly_tomont_native(poly *); * OUTPUT * - cache: pointer to multiplication cache **************************************************/ -static INLINE void poly_mulcache_compute_native(poly_mulcache *cache, - const poly *poly); +static INLINE void poly_mulcache_compute_native(int16_t cache[MLKEM_N / 2], + const int16_t poly[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ #if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) @@ -189,8 +188,9 @@ static INLINE void poly_mulcache_compute_native(poly_mulcache *cache, * in NTT domain, and of the same order as a and b. 
**************************************************/ static INLINE void polyvec_basemul_acc_montgomery_cached_native( - poly *r, const polyvec *a, const polyvec *b, - const polyvec_mulcache *b_cache); + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]); #endif #if defined(MLKEM_USE_NATIVE_POLY_TOBYTES) @@ -209,7 +209,7 @@ static INLINE void polyvec_basemul_acc_montgomery_cached_native( * (of MLKEM_POLYBYTES bytes) **************************************************/ static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], - const poly *a); + const int16_t a[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ #if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) @@ -226,7 +226,7 @@ static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], * - a: const pointer to input byte aray * (of MLKEM_POLYBYTES bytes) **************************************************/ -static INLINE void poly_frombytes_native(poly *a, +static INLINE void poly_frombytes_native(int16_t a[MLKEM_N], const uint8_t r[MLKEM_POLYBYTES]); #endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/native/default.h similarity index 97% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/default.h rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/native/default.h index d1e41c52e..f9fe4310a 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/default.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/native/default.h @@ -8,7 +8,7 @@ /* * Default arithmetic backend */ -#include "sys.h" +#include "../sys.h" #ifdef SYS_AARCH64 /* diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.c deleted file mode 100644 index 3651c8da9..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.c +++ /dev/null @@ -1,266 +0,0 @@ -/* - * 
Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) - -#include -#include "arith_backend.h" -#include "debug.h" -#include "ntt.h" -#include "reduce.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block) -#define ntt_layer MLKEM_NAMESPACE(ntt_layer) -#define invntt_layer MLKEM_NAMESPACE(invntt_layer) -/* End of static namespacing */ - -#if !defined(MLKEM_USE_NATIVE_NTT) -/* - * Computes a block CT butterflies with a fixed twiddle factor, - * using Montgomery multiplication. - * Parameters: - * - r: Pointer to base of polynomial (_not_ the base of butterfly block) - * - root: Twiddle factor to use for the butterfly. This must be in - * Montgomery form and signed canonical. - * - start: Offset to the beginning of the butterfly block - * - len: Index difference between coefficients subject to a butterfly - * - bound: Ghost variable describing coefficient bound: Prior to `start`, - * coefficients must be bound by `bound + MLKEM_Q`. Post `start`, - * they must be bound by `bound`. - * When this function returns, output coefficients in the index range - * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`. 
- * Example: - * - start=8, len=4 - * This would compute the following four butterflies - * 8 -- 12 - * 9 -- 13 - * 10 -- 14 - * 11 -- 15 - * - start=4, len=2 - * This would compute the following two butterflies - * 4 -- 6 - * 5 -- 7 - */ -static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, - unsigned start, unsigned len, int bound) -__contract__( - requires(start < MLKEM_N) - requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N) - requires(0 <= bound && bound < INT16_MAX - MLKEM_Q) - requires(-HALF_Q < zeta && zeta < HALF_Q) - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(array_abs_bound(r, 0, start, bound + MLKEM_Q)) - requires(array_abs_bound(r, start, MLKEM_N, bound)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q)) - ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound))) -{ - /* `bound` is a ghost variable only needed in the CBMC specification */ - unsigned j; - ((void)bound); - for (j = start; j < start + len; j++) - __loop__( - invariant(start <= j && j <= start + len) - /* - * Coefficients are updated in strided pairs, so the bounds for the - * intermediate states alternate twice between the old and new bound - */ - invariant(array_abs_bound(r, 0, j, bound + MLKEM_Q)) - invariant(array_abs_bound(r, j, start + len, bound)) - invariant(array_abs_bound(r, start + len, j + len, bound + MLKEM_Q)) - invariant(array_abs_bound(r, j + len, MLKEM_N, bound))) - { - int16_t t; - t = fqmul(r[j + len], zeta); - r[j + len] = r[j] - t; - r[j] = r[j] + t; - } -} - -/* - *Compute one layer of forward NTT - * Parameters: - * - r: Pointer to base of polynomial - * - len: Stride of butterflies in this layer. - * - layer: Ghost variable indicating which layer is being applied. - * Must match `len` via `len == MLKEM_N >> layer`. 
- * Note: `len` could be dropped and computed in the function, but - * we are following the structure of the reference NTT from the - * official Kyber implementation here, merely adding `layer` as - * a ghost variable for the specifications. - */ -static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer) -__contract__( - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer)) - requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q))) -{ - unsigned start, k; - /* `layer` is a ghost variable only needed in the CBMC specification */ - ((void)layer); - /* Twiddle factors for layer n start at index 2^(layer-1) */ - k = MLKEM_N / (2 * len); - for (start = 0; start < MLKEM_N; start += 2 * len) - __loop__( - invariant(start < MLKEM_N + 2 * len) - invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N) - invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q)) - invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q))) - { - int16_t zeta = zetas[k++]; - ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q); - } -} - -/* - * Compute full forward NTT - * NOTE: This particular implementation satisfies a much tighter - * bound on the output coefficients (5*q) than the contractual one (8*q), - * but this is not needed in the calling code. Should we change the - * base multiplication strategy to require smaller NTT output bounds, - * the proof may need strengthening. 
- */ - -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *p) -{ - unsigned len, layer; - int16_t *r; - debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); - r = p->coeffs; - - for (len = 128, layer = 1; len >= 2; len >>= 1, layer++) - __loop__( - invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer)) - invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))) - { - ntt_layer(r, len, layer); - } - - /* Check the stronger bound */ - debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); -} -#else /* MLKEM_USE_NATIVE_NTT */ - -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *p) -{ - debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); - ntt_native(p); - debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); -} -#endif /* MLKEM_USE_NATIVE_NTT */ - -#if !defined(MLKEM_USE_NATIVE_INTT) - -/* Compute one layer of inverse NTT */ -static void invntt_layer(int16_t *r, unsigned len, unsigned layer) -__contract__( - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7) - requires(len == (1 << (8 - layer))) - requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) -{ - unsigned start, k; - /* `layer` is a ghost variable used only in the specification */ - ((void)layer); - k = MLKEM_N / len - 1; - for (start = 0; start < MLKEM_N; start += 2 * len) - __loop__( - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) - invariant(start <= MLKEM_N && k <= 127) - /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */ - invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len)) - { - unsigned j; - int16_t zeta = zetas[k--]; - for (j = start; j < start + len; j++) - __loop__( - invariant(start <= j && j <= start + len) - invariant(start <= MLKEM_N && k <= 127) - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) - { - int16_t t = r[j]; - r[j] = barrett_reduce(t + r[j + len]); - r[j + len] = r[j + len] - t; - r[j + len] = fqmul(r[j + len], 
zeta); - } - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *p) -{ - /* - * Scale input polynomial to account for Montgomery factor - * and NTT twist. This also brings coefficients down to - * absolute value < MLKEM_Q. - */ - unsigned j, len, layer; - const int16_t f = 1441; - int16_t *r = p->coeffs; - - for (j = 0; j < MLKEM_N; j++) - __loop__( - invariant(j <= MLKEM_N) - invariant(array_abs_bound(r, 0, j, MLKEM_Q))) - { - r[j] = fqmul(r[j], f); - } - - /* Run the invNTT layers */ - for (len = 2, layer = 7; len <= 128; len <<= 1, layer--) - __loop__( - invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer))) - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) - { - invntt_layer(p->coeffs, len, layer); - } - - debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); -} -#else /* MLKEM_USE_NATIVE_INTT */ - -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *p) -{ - intt_native(p); - debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); -} -#endif /* MLKEM_USE_NATIVE_INTT */ - -MLKEM_NATIVE_INTERNAL_API -void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], - int16_t b_cached) -{ - int32_t t0, t1; - debug_assert_bound(a, 2, 0, UINT12_LIMIT); - - t0 = (int32_t)a[1] * b_cached; - t0 += (int32_t)a[0] * b[0]; - t1 = (int32_t)a[0] * b[1]; - t1 += (int32_t)a[1] * b[0]; - - /* |ti| < 2 * q * 2^15 */ - r[0] = montgomery_reduce(t0); - r[1] = montgomery_reduce(t1); - - debug_assert_abs_bound(r, 2, 2 * MLKEM_Q); -} - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt) -int empty_cu_ntt; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.h deleted file mode 100644 index 4e80d3ab3..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * 
SPDX-License-Identifier: Apache-2.0 - */ -#ifndef NTT_H -#define NTT_H -#include "common.h" - -#include -#include "cbmc.h" -#include "poly.h" -#include "reduce.h" - -#define zetas MLKEM_NAMESPACE(zetas) -extern const int16_t zetas[128]; - -#define poly_ntt MLKEM_NAMESPACE(poly_ntt) -/************************************************* - * Name: poly_ntt - * - * Description: Computes negacyclic number-theoretic transform (NTT) of - * a polynomial in place. - * - * The input is assumed to be in normal order and - * coefficient-wise bound by MLKEM_Q in absolute value. - * - * The output polynomial is in bitreversed order, and - * coefficient-wise bound by NTT_BOUND in absolute value. - * - * (NOTE: Sometimes the input to the NTT is actually smaller, - * which gives better bounds.) - * - * Arguments: - poly *p: pointer to in/output polynomial - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *r) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q)) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND)) -); - -#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont) -/************************************************* - * Name: poly_invntt_tomont - * - * Description: Computes inverse of negacyclic number-theoretic transform (NTT) - * of a polynomial in place; - * inputs assumed to be in bitreversed order, output in normal - * order - * - * The input is assumed to be in bitreversed order, and can - * have arbitrary coefficients in int16_t. - * - * The output polynomial is in normal order, and - * coefficient-wise bound by INVNTT_BOUND in absolute value. 
- * - * Arguments: - uint16_t *a: pointer to in/output polynomial - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *r) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND)) -); - -#define basemul_cached MLKEM_NAMESPACE(basemul_cached) -/************************************************************ - * Name: basemul_cached - * - * Description: Computes a representative modulo q of - * (a0*b0 + a1*b_cached, a0*b1 + a1*b0)/65536 - * - * If b_cached is b1*zeta, this represents the - * product of (a0 + a1*X) and (b0 + b1*X) in - * Fq[X]/(X^2 - zeta). - * - * Arguments: - r: Pointer to output polynomial - * Upon return, coefficients are bound by - * 2*MLKEM_Q in absolute value. - * - a: Pointer to first input polynomial - * Every coefficient must be in [0..4095] - * - b: Pointer to second input polynomial - * Can have arbitrary int16_t coefficients - * - b_cached: Some precomputed value, typically derived from - * b1 and a twiddle factor. Can be an arbitary int16_t. 
- ************************************************************/ -MLKEM_NATIVE_INTERNAL_API -void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], - int16_t b_cached) -__contract__( - requires(memory_no_alias(r, 2 * sizeof(int16_t))) - requires(memory_no_alias(a, 2 * sizeof(int16_t))) - requires(memory_no_alias(b, 2 * sizeof(int16_t))) - requires(array_bound(a, 0, 2, 0, UINT12_LIMIT)) - assigns(memory_slice(r, 2 * sizeof(int16_t))) - ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q)) -); - -#endif /* NTT_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/params.h index 57ea4c8ba..7f6c12625 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/params.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/params.h @@ -18,6 +18,7 @@ #define MLKEM_N 256 #define MLKEM_Q 3329 #define UINT12_LIMIT 4096 +#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */ #define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define MLKEM_SSBYTES 32 /* size in bytes of shared key */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.c index 7483ebf6d..e8a2e2c6e 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.c @@ -8,388 +8,246 @@ #include #include #include "arith_backend.h" -#include "cbd.h" #include "cbmc.h" #include "debug.h" #include "fips202x4.h" -#include "ntt.h" #include "poly.h" -#include "reduce.h" +#include "sampling.h" #include "symmetric.h" #include "verify.h" -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3) -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a) -{ - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - uint8_t t[8] = {0}; - for (j = 0; j < 8; j++) - 
__loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_bound(t, 0, j, 0, 16))) - { - t[j] = scalar_compress_d4(a->coeffs[8 * i + j]); - } - - r[i * 4] = t[0] | (t[1] << 4); - r[i * 4 + 1] = t[2] | (t[3] << 4); - r[i * 4 + 2] = t[4] | (t[5] << 4); - r[i * 4 + 3] = t[6] | (t[7] << 4); - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a) -{ - unsigned j; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - for (j = 0; j < MLKEM_N / 4; j++) - __loop__(invariant(j <= MLKEM_N / 4)) - { - unsigned k; - uint16_t t[4]; - for (k = 0; k < 4; k++) - __loop__( - invariant(k <= 4) - invariant(forall(r, 0, k, t[r] < (1u << 10)))) - { - t[k] = scalar_compress_d10(a->coeffs[4 * j + k]); - } - - /* - * Make all implicit truncation explicit. No data is being - * truncated for the LHS's since each t[i] is 10-bit in size. - */ - r[5 * j + 0] = (t[0] >> 0) & 0xFF; - r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF); - r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF); - r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF); - r[5 * j + 4] = (t[3] >> 2); - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 2; i++) - __loop__( - invariant(i <= MLKEM_N / 2) - invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q))) - { - r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF); - r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF); - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); -} - -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d10(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]) +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16) +#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic) +#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce) +#define fqmul MLKEM_NAMESPACE(fqmul) +#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce) +#define basemul_cached MLKEM_NAMESPACE(basemul_cached) +#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q) +#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block) +#define ntt_layer MLKEM_NAMESPACE(ntt_layer) +#define invntt_layer MLKEM_NAMESPACE(invntt_layer) +/* End of static namespacing */ + +/************************************************* + * Name: cast_uint16_to_int16 + * + * Description: Cast uint16 value to int16 + * + * Returns: + * input x in 0 .. 32767: returns value unchanged + * input x in 32768 .. 65535: returns (x - 65536) + **************************************************/ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "conversion" +#endif +ALWAYS_INLINE +static INLINE int16_t cast_uint16_to_int16(uint16_t x) { - unsigned j; - for (j = 0; j < MLKEM_N / 4; j++) - __loop__( - invariant(j <= MLKEM_N / 4) - invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q))) - { - unsigned k; - uint16_t t[4]; - uint8_t const *base = &a[5 * j]; - - t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); - t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6)); - t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4)); - t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2)); - - for (k = 0; k < 4; k++) - __loop__( - invariant(k <= 4) - invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q))) - { - r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]); - } - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + /* + * PORTABILITY: This relies on uint16_t -> int16_t + * being implemented as the inverse of int16_t -> uint16_t, + * which is implementation-defined (C99 6.3.1.3 (3)) + * 
CBMC (correctly) fails to prove this conversion is OK, + * so we have to suppress that check here + */ + return (int16_t)x; } -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ - || MLKEM_K == 3) */ +#ifdef CBMC +#pragma CPROVER check pop +#endif -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a) +/************************************************* + * Name: montgomery_reduce_generic + * + * Description: Generic Montgomery reduction; given a 32-bit integer a, computes + * 16-bit integer congruent to a * R^-1 mod q, where R=2^16 + * + * Arguments: - int32_t a: input integer to be reduced + * + * Returns: integer congruent to a * R^-1 modulo q, with absolute value + * <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2 + * + **************************************************/ +ALWAYS_INLINE +static INLINE int16_t montgomery_reduce_generic(int32_t a) { - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */ + const uint32_t QINV = 62209; /* q^-1 mod 2^16 */ - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - uint8_t t[8] = {0}; - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_bound(t, 0, j, 0, 32))) - { - t[j] = scalar_compress_d5(a->coeffs[8 * i + j]); - } + /* Compute a*q^{-1} mod 2^16 in unsigned representatives */ + const uint16_t a_reduced = a & UINT16_MAX; + const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX; - /* - * Explicitly truncate to avoid warning about - * implicit truncation in CBMC, and use array indexing into - * r rather than pointer-arithmetic to simplify verification - */ - r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5)); - r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); - r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4)); 
- r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); - r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3)); - } -} + /* Lift to signed canonical representative mod 2^16. */ + const int16_t t = cast_uint16_to_int16(a_inverted); -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a) -{ - unsigned j; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + int32_t r = a - ((int32_t)t * MLKEM_Q); + /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */ - for (j = 0; j < MLKEM_N / 8; j++) - __loop__(invariant(j <= MLKEM_N / 8)) - { - unsigned k; - uint16_t t[8]; - for (k = 0; k < 8; k++) - __loop__( - invariant(k <= 8) - invariant(forall(r, 0, k, t[r] < (1u << 11)))) - { - t[k] = scalar_compress_d11(a->coeffs[8 * j + k]); - } + /* + * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, + * implementation-defined for negative left argument. Here, + * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) + */ + r = r >> 16; + /* Bounds: |r >> 16| <= ceil(|r| / 2^16) + * <= ceil(|a| / 2^16 + MLKEM_Q / 2) + * <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 + * + * (Note that |a >> n| = ceil(|a| / 2^16) for negative a) + */ - /* - * Make all implicit truncation explicit. No data is being - * truncated for the LHS's since each t[i] is 11-bit in size. 
- */ - r[11 * j + 0] = (t[0] >> 0) & 0xFF; - r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF); - r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF); - r[11 * j + 3] = (t[2] >> 2) & 0xFF; - r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF); - r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF); - r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF); - r[11 * j + 7] = (t[5] >> 1) & 0xFF; - r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF); - r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF); - r[11 * j + 10] = (t[7] >> 3); - } + return (int16_t)r; } -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]) +/************************************************* + * Name: montgomery_reduce + * + * Description: Montgomery reduction + * + * Arguments: - int32_t a: input integer to be reduced + * Must be smaller than 2 * 2^12 * 2^15 in absolute value. + * + * Returns: integer congruent to a * R^-1 modulo q, + * smaller than 2 * q in absolute value. + **************************************************/ +static INLINE int16_t montgomery_reduce(int32_t a) +__contract__( + requires(a > -(2 * UINT12_LIMIT * 32768)) + requires(a < (2 * UINT12_LIMIT * 32768)) + ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q) +) { - unsigned i; - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) - { - unsigned j; - uint8_t t[8]; - const unsigned offset = i * 5; - /* - * Explicitly truncate to avoid warning about - * implicit truncation in CBMC and unwind loop for ease - * of proof. 
- */ - - /* - * Decompress 5 8-bit bytes (so 40 bits) into - * 8 5-bit values stored in t[] - */ - t[0] = 0x1F & (a[offset + 0] >> 0); - t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3)); - t[2] = 0x1F & (a[offset + 1] >> 2); - t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1)); - t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4)); - t[5] = 0x1F & (a[offset + 3] >> 1); - t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2)); - t[7] = 0x1F & (a[offset + 4] >> 3); - - /* and copy to the correct slice in r[] */ - for (j = 0; j < 8; j++) - __loop__( - invariant(j <= 8 && i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) - { - r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]); - } - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + int16_t res; + debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768); + + res = montgomery_reduce_generic(a); + /* Bounds: + * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2 + * <= UINT12_LIMIT + (MLKEM_Q + 1) / 2 + * < 2 * MLKEM_Q */ + + debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q); + return res; } -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d11(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]) +#if !defined(MLKEM_USE_NATIVE_POLY_TOMONT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: fqmul + * + * Description: Montgomery multiplication modulo q=3329 + * + * Arguments: - int16_t a: first factor + * Can be any int16_t. + * - int16_t b: second factor. + * Must be signed canonical (abs value <(q+1)/2) + * + * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and + * smaller than q in absolute value. 
+ * + **************************************************/ +static INLINE int16_t fqmul(int16_t a, int16_t b) +__contract__( + requires(b > -HALF_Q) + requires(b < HALF_Q) + ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q) +) { - unsigned j; - for (j = 0; j < MLKEM_N / 8; j++) - __loop__( - invariant(j <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q))) - { - unsigned k; - uint16_t t[8]; - uint8_t const *base = &a[11 * j]; - t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); - t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5)); - t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) | - ((uint16_t)base[4] << 10)); - t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7)); - t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4)); - t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) | - ((uint16_t)base[8] << 9)); - t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6)); - t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3)); - - for (k = 0; k < 8; k++) - __loop__( - invariant(k <= 8) - invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q))) - { - r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]); - } - } + int16_t res; + debug_assert_abs_bound(&b, 1, HALF_Q); + + res = montgomery_reduce((int32_t)a * (int32_t)b); + /* Bounds: + * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2 + * < MLKEM_Q + */ - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + debug_assert_abs_bound(&res, 1, MLKEM_Q); + return res; } -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */ - -#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +#endif /* !defined(MLKEM_USE_NATIVE_POLY_TOMONT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_NTT) || \ + 
!defined(MLKEM_USE_NATIVE_INTT) */ + +#if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || !defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: barrett_reduce + * + * Description: Barrett reduction; given a 16-bit integer a, computes + * centered representative congruent to a mod q in + * {-(q-1)/2,...,(q-1)/2} + * + * Arguments: - int16_t a: input integer to be reduced + * + * Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. + **************************************************/ +static INLINE int16_t barrett_reduce(int16_t a) +__contract__( + ensures(return_value > -HALF_Q && return_value < HALF_Q) +) { - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_N / 2; i++) - __loop__(invariant(i <= MLKEM_N / 2)) - { - const uint16_t t0 = a->coeffs[2 * i]; - const uint16_t t1 = a->coeffs[2 * i + 1]; - /* - * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of - * significant data, so these can be packed into 24 bits or exactly - * 3 bytes, as follows. - */ - - /* Least significant bits 0 - 7 of t0. */ - r[3 * i + 0] = t0 & 0xFF; - - /* - * Most significant bits 8 - 11 of t0 become the least significant - * nibble of the second byte. The least significant 4 bits - * of t1 become the upper nibble of the second byte. - */ - r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0); + /* + * To divide by MLKEM_Q using Barrett multiplication, the "magic number" + * multiplier is round_to_nearest(2**26/MLKEM_Q) + */ + const int BPOWER = 26; + const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q; - /* Bits 4 - 11 of t1 become the third byte. 
*/ - r[3 * i + 2] = t1 >> 4; - } -} -#else /* MLKEM_USE_NATIVE_POLY_TOBYTES */ -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) -{ - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - poly_tobytes_native(r, a); -} -#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + /* + * Compute round_to_nearest(a/MLKEM_Q) using the multiplier + * above and shift by BPOWER places. + * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, + * implementation-defined for negative left argument. Here, + * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) + */ + const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER; -#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 2; i++) - __loop__( - invariant(i <= MLKEM_N / 2) - invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT))) - { - const uint8_t t0 = a[3 * i + 0]; - const uint8_t t1 = a[3 * i + 1]; - const uint8_t t2 = a[3 * i + 2]; - r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF); - r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4); - } + /* + * t is in -10 .. 
+10, so we need 32-bit math to + * evaluate t * MLKEM_Q and the subsequent subtraction + */ + int16_t res = (int16_t)(a - t * MLKEM_Q); - /* Note that the coefficients are not canonical */ - debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT); -} -#else /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -{ - poly_frombytes_native(r, a); + debug_assert_abs_bound(&res, 1, HALF_Q); + return res; } -#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ - -MLKEM_NATIVE_INTERNAL_API -void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +#endif /* !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || \ + !defined(MLKEM_USE_NATIVE_INTT) */ + +static void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], + int16_t b_cached) +__contract__( + requires(memory_no_alias(r, 2 * sizeof(int16_t))) + requires(memory_no_alias(a, 2 * sizeof(int16_t))) + requires(memory_no_alias(b, 2 * sizeof(int16_t))) + requires(array_bound(a, 0, 2, 0, UINT12_LIMIT)) + assigns(memory_slice(r, 2 * sizeof(int16_t))) + ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))) { - unsigned i; -#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8) -#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!" 
-#endif + int32_t t0, t1; + debug_assert_bound(a, 2, 0, UINT12_LIMIT); - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) - { - unsigned j; - for (j = 0; j < 8; j++) - __loop__( - invariant(i < MLKEM_N / 8 && j <= 8) - invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) - { - /* Prevent the compiler from recognizing this as a bit selection */ - uint8_t mask = value_barrier_u8(1u << j); - r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask); - } - } - debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); -} + t0 = (int32_t)a[1] * b_cached; + t0 += (int32_t)a[0] * b[0]; + t1 = (int32_t)a[0] * b[1]; + t1 += (int32_t)a[1] * b[0]; -MLKEM_NATIVE_INTERNAL_API -void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a) -{ - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + /* |ti| < 2 * q * 2^15 */ + r[0] = montgomery_reduce(t0); + r[1] = montgomery_reduce(t1); - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - msg[i] = 0; - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8)) - { - uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]); - msg[i] |= t << j; - } - } + debug_assert_abs_bound(r, 2, 2 * MLKEM_Q); } MLKEM_NATIVE_INTERNAL_API @@ -434,12 +292,46 @@ void poly_tomont(poly *r) MLKEM_NATIVE_INTERNAL_API void poly_tomont(poly *r) { - poly_tomont_native(r); + poly_tomont_native(r->coeffs); debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); } #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ #if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) +/************************************************************ + * Name: scalar_signed_to_unsigned_q + * + * Description: converts signed polynomial coefficient + * from signed (-3328 .. 3328) form to + * unsigned form (0 .. 3328). 
+ * + * Note: Cryptographic constant time implementation + * + * Examples: 0 -> 0 + * 1 -> 1 + * 3328 -> 3328 + * -1 -> 3328 + * -2 -> 3327 + * -3328 -> 1 + * + * Arguments: c: signed coefficient to be converted + ************************************************************/ +static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c) +__contract__( + requires(c > -MLKEM_Q && c < MLKEM_Q) + ensures(return_value >= 0 && return_value < MLKEM_Q) + ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q))) +{ + debug_assert_abs_bound(&c, 1, MLKEM_Q); + + /* Add Q if c is negative, but in constant time */ + c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c)); + + /* and therefore cast to uint16_t is safe. */ + debug_assert_bound(&c, 1, 0, MLKEM_Q); + return (uint16_t)c; +} + MLKEM_NATIVE_INTERNAL_API void poly_reduce(poly *r) { @@ -461,7 +353,7 @@ void poly_reduce(poly *r) MLKEM_NATIVE_INTERNAL_API void poly_reduce(poly *r) { - poly_reduce_native(r); + poly_reduce_native(r->coeffs); debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); } #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ @@ -520,13 +412,232 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a) MLKEM_NATIVE_INTERNAL_API void poly_mulcache_compute(poly_mulcache *x, const poly *a) { - poly_mulcache_compute_native(x, a); + poly_mulcache_compute_native(x->coeffs, a->coeffs); /* Omitting bounds assertion since native implementations may * decide not to use a mulcache. Note that the C backend implementation * of poly_basemul_montgomery_cached() does still include the check. */ } #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ +#if !defined(MLKEM_USE_NATIVE_NTT) +/* + * Computes a block CT butterflies with a fixed twiddle factor, + * using Montgomery multiplication. + * Parameters: + * - r: Pointer to base of polynomial (_not_ the base of butterfly block) + * - root: Twiddle factor to use for the butterfly. This must be in + * Montgomery form and signed canonical. 
+ * - start: Offset to the beginning of the butterfly block + * - len: Index difference between coefficients subject to a butterfly + * - bound: Ghost variable describing coefficient bound: Prior to `start`, + * coefficients must be bound by `bound + MLKEM_Q`. Post `start`, + * they must be bound by `bound`. + * When this function returns, output coefficients in the index range + * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`. + * Example: + * - start=8, len=4 + * This would compute the following four butterflies + * 8 -- 12 + * 9 -- 13 + * 10 -- 14 + * 11 -- 15 + * - start=4, len=2 + * This would compute the following two butterflies + * 4 -- 6 + * 5 -- 7 + */ +static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, + unsigned start, unsigned len, int bound) +__contract__( + requires(start < MLKEM_N) + requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N) + requires(0 <= bound && bound < INT16_MAX - MLKEM_Q) + requires(-HALF_Q < zeta && zeta < HALF_Q) + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(array_abs_bound(r, 0, start, bound + MLKEM_Q)) + requires(array_abs_bound(r, start, MLKEM_N, bound)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q)) + ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound))) +{ + /* `bound` is a ghost variable only needed in the CBMC specification */ + unsigned j; + ((void)bound); + for (j = start; j < start + len; j++) + __loop__( + invariant(start <= j && j <= start + len) + /* + * Coefficients are updated in strided pairs, so the bounds for the + * intermediate states alternate twice between the old and new bound + */ + invariant(array_abs_bound(r, 0, j, bound + MLKEM_Q)) + invariant(array_abs_bound(r, j, start + len, bound)) + invariant(array_abs_bound(r, start + len, j + len, bound + MLKEM_Q)) + invariant(array_abs_bound(r, j + len, MLKEM_N, bound))) + { + int16_t t; + t = fqmul(r[j + len], 
zeta); + r[j + len] = r[j] - t; + r[j] = r[j] + t; + } +} + +/* + *Compute one layer of forward NTT + * Parameters: + * - r: Pointer to base of polynomial + * - len: Stride of butterflies in this layer. + * - layer: Ghost variable indicating which layer is being applied. + * Must match `len` via `len == MLKEM_N >> layer`. + * Note: `len` could be dropped and computed in the function, but + * we are following the structure of the reference NTT from the + * official Kyber implementation here, merely adding `layer` as + * a ghost variable for the specifications. + */ +static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer) +__contract__( + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer)) + requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q))) +{ + unsigned start, k; + /* `layer` is a ghost variable only needed in the CBMC specification */ + ((void)layer); + /* Twiddle factors for layer n start at index 2^(layer-1) */ + k = MLKEM_N / (2 * len); + for (start = 0; start < MLKEM_N; start += 2 * len) + __loop__( + invariant(start < MLKEM_N + 2 * len) + invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N) + invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q)) + invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q))) + { + int16_t zeta = zetas[k++]; + ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q); + } +} + +/* + * Compute full forward NTT + * NOTE: This particular implementation satisfies a much tighter + * bound on the output coefficients (5*q) than the contractual one (8*q), + * but this is not needed in the calling code. Should we change the + * base multiplication strategy to require smaller NTT output bounds, + * the proof may need strengthening. 
+ */ + +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *p) +{ + unsigned len, layer; + int16_t *r; + debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); + r = p->coeffs; + + for (len = 128, layer = 1; len >= 2; len >>= 1, layer++) + __loop__( + invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer)) + invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))) + { + ntt_layer(r, len, layer); + } + + /* Check the stronger bound */ + debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); +} +#else /* MLKEM_USE_NATIVE_NTT */ + +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *p) +{ + debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); + ntt_native(p->coeffs); + debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); +} +#endif /* MLKEM_USE_NATIVE_NTT */ + +#if !defined(MLKEM_USE_NATIVE_INTT) + +/* Compute one layer of inverse NTT */ +static void invntt_layer(int16_t *r, unsigned len, unsigned layer) +__contract__( + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7) + requires(len == (1 << (8 - layer))) + requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) +{ + unsigned start, k; + /* `layer` is a ghost variable used only in the specification */ + ((void)layer); + k = MLKEM_N / len - 1; + for (start = 0; start < MLKEM_N; start += 2 * len) + __loop__( + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) + invariant(start <= MLKEM_N && k <= 127) + /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */ + invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len)) + { + unsigned j; + int16_t zeta = zetas[k--]; + for (j = start; j < start + len; j++) + __loop__( + invariant(start <= j && j <= start + len) + invariant(start <= MLKEM_N && k <= 127) + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) + { + int16_t t = r[j]; + r[j] = barrett_reduce(t + r[j + len]); + r[j + len] = r[j + len] - t; + r[j + len] = fqmul(r[j 
+ len], zeta); + } + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *p) +{ + /* + * Scale input polynomial to account for Montgomery factor + * and NTT twist. This also brings coefficients down to + * absolute value < MLKEM_Q. + */ + unsigned j, len, layer; + const int16_t f = 1441; + int16_t *r = p->coeffs; + + for (j = 0; j < MLKEM_N; j++) + __loop__( + invariant(j <= MLKEM_N) + invariant(array_abs_bound(r, 0, j, MLKEM_Q))) + { + r[j] = fqmul(r[j], f); + } + + /* Run the invNTT layers */ + for (len = 2, layer = 7; len <= 128; len <<= 1, layer--) + __loop__( + invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer))) + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) + { + invntt_layer(p->coeffs, len, layer); + } + + debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); +} +#else /* MLKEM_USE_NATIVE_INTT */ + +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *p) +{ + intt_native(p->coeffs); + debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); +} +#endif /* MLKEM_USE_NATIVE_INTT */ + #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ #define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.h index 6a14c785d..cb0d67c1a 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.h @@ -9,7 +9,7 @@ #include #include "cbmc.h" #include "common.h" -#include "reduce.h" +#include "debug.h" #include "verify.h" /* Absolute exclusive upper bound for the output of the inverse NTT */ @@ -18,6 +18,9 @@ /* Absolute exclusive upper bound for the output of the forward NTT */ #define NTT_BOUND (8 * MLKEM_Q) +#define zetas MLKEM_NAMESPACE(zetas) +extern const int16_t zetas[128]; + /* * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] @@ -38,520 +41,6 @@ typedef struct int16_t coeffs[MLKEM_N >> 1]; } poly_mulcache; -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1) -#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4) -#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5) -#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10) -#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11) -#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4) -#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5) -#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10) -#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11) -#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q) -/* End of static namespacing */ - -/************************************************************ - * Name: scalar_compress_d1 - * - * Description: Computes round(u * 2 / q) - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 1. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. 
- */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d1(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 2) - ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) ) -{ - uint32_t d0 = u << 1; - d0 *= 645083; - d0 += 1u << 30; - d0 >>= 31; - return d0; -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_compress_d4 - * - * Description: Computes round(u * 16 / q) % 16 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 4. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d4(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 16) - ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16)) -{ - uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */ - return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */ -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d4 - * - * Description: Computes round(u * q / 16) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 4. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d4(uint32_t u) -__contract__( - requires(0 <= u && u < 16) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 8) / 16; } - -/************************************************************ - * Name: scalar_compress_d5 - * - * Description: Computes round(u * 32 / q) % 32 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 5. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d5(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 32) - ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) ) -{ - uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */ - return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */ -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d5 - * - * Description: Computes round(u * q / 32) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 5. - * - * Arguments: - u: Unsigned canonical modulus modulo 32 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d5(uint32_t u) -__contract__( - requires(0 <= u && u < 32) - ensures(return_value <= MLKEM_Q - 1) -) { return ((u * MLKEM_Q) + 16) / 32; } - -/************************************************************ - * Name: scalar_compress_d10 - * - * Description: Computes round(u * 2**10 / q) % 2**10 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d10(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < (1u << 10)) - ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10))) -{ - uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^32 / MLKEM_Q) */ - d0 = (d0 + ((uint64_t)1u << 32)) >> 33; - return (d0 & 0x3FF); -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d10 - * - * Description: Computes round(u * q / 1024) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d10(uint32_t u) -__contract__( - requires(0 <= u && u < 1024) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 512) / 1024; } - -/************************************************************ - * Name: scalar_compress_d11 - * - * Description: Computes round(u * 2**11 / q) % 2**11 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 11. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d11(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < (1u << 11)) - ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11))) -{ - uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */ - d0 = (d0 + ((uint64_t)1u << 32)) >> 33; - return (d0 & 0x7FF); -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d11 - * - * Description: Computes round(u * q / 1024) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d11(uint32_t u) -__contract__( - requires(0 <= u && u < 2048) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 1024) / 2048; } - -/************************************************************ - * Name: scalar_signed_to_unsigned_q - * - * Description: converts signed polynomial coefficient - * from signed (-3328 .. 3328) form to - * unsigned form (0 .. 3328). - * - * Note: Cryptographic constant time implementation - * - * Examples: 0 -> 0 - * 1 -> 1 - * 3328 -> 3328 - * -1 -> 3328 - * -2 -> 3327 - * -3328 -> 1 - * - * Arguments: c: signed coefficient to be converted - ************************************************************/ -static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c) -__contract__( - requires(c > -MLKEM_Q && c < MLKEM_Q) - ensures(return_value >= 0 && return_value < MLKEM_Q) - ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q))) -{ - debug_assert_abs_bound(&c, 1, MLKEM_Q); - - /* Add Q if c is negative, but in constant time */ - c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c)); - - /* and therefore cast to uint16_t is safe. */ - debug_assert_bound(&c, 1, 0, MLKEM_Q); - return (uint16_t)c; -} - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \ - (MLKEM_K == 2 || MLKEM_K == 3) -#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4) -/************************************************* - * Name: poly_compress_d4 - * - * Description: Compression (4 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a); - -#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10) -/************************************************* - * Name: poly_compress_d10 - * - * Description: Compression (10 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a); - -#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4) -/************************************************* - * Name: poly_decompress_d4 - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]); - -#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10) -/************************************************* - * Name: poly_decompress_d10 - * - * Description: De-serialization and subsequent decompression (10 bits) of a - * polynomial; approximate inverse of poly_compress_d10 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d10(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]); -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ - || MLKEM_K == 3) */ - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 -#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5) -/************************************************* - * Name: poly_compress_d5 - * - * Description: Compression (5 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a); - -#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11) -/************************************************* - * Name: poly_compress_d11 - * - * Description: Compression (11 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a); - -#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5) -/************************************************* - * Name: poly_decompress_d5 - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]); - -#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11) -/************************************************* - * Name: poly_decompress_d11 - * - * Description: De-serialization and subsequent decompression (11 bits) of a - * polynomial; approximate inverse of poly_compress_d11 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d11(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]); -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \ - */ - -#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes) -/************************************************* - * Name: poly_tobytes - * - * Description: Serialization of a polynomial. - * Signed coefficients are converted to - * unsigned form before serialization. 
- * - * Arguments: INPUT: - * - a: const pointer to input polynomial, - * with each coefficient in the range [0,1,..,Q-1] - * OUTPUT - * - r: pointer to output byte array - * (of MLKEM_POLYBYTES bytes) - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYBYTES)) - requires(memory_no_alias(a, sizeof(poly))) - requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(r)) -); - - -#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes) -/************************************************* - * Name: poly_frombytes - * - * Description: De-serialization of a polynomial. - * - * Arguments: INPUT - * - a: pointer to input byte array - * (of MLKEM_POLYBYTES bytes) - * OUTPUT - * - r: pointer to output polynomial, with - * each coefficient unsigned and in the range - * 0 .. 4095 - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)) -); - - -#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg) -/************************************************* - * Name: poly_frommsg - * - * Description: Convert 32-byte message to polynomial - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *msg: pointer to input message - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) -__contract__( - requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(object_whole(r)) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) -); - -#define poly_tomsg 
MLKEM_NAMESPACE(poly_tomsg) -/************************************************* - * Name: poly_tomsg - * - * Description: Convert polynomial to 32-byte message - * - * Arguments: - uint8_t *msg: pointer to output message - * - const poly *r: pointer to input polynomial - * Coefficients must be unsigned canonical - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r) -__contract__( - requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(msg)) -); - #define poly_basemul_montgomery_cached \ MLKEM_NAMESPACE(poly_basemul_montgomery_cached) /************************************************* @@ -715,4 +204,56 @@ __contract__( assigns(object_whole(r)) ); +#define poly_ntt MLKEM_NAMESPACE(poly_ntt) +/************************************************* + * Name: poly_ntt + * + * Description: Computes negacyclic number-theoretic transform (NTT) of + * a polynomial in place. + * + * The input is assumed to be in normal order and + * coefficient-wise bound by MLKEM_Q in absolute value. + * + * The output polynomial is in bitreversed order, and + * coefficient-wise bound by NTT_BOUND in absolute value. + * + * (NOTE: Sometimes the input to the NTT is actually smaller, + * which gives better bounds.) 
+ * + * Arguments: - poly *r: pointer to in/output polynomial + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *r) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q)) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND)) +); + +#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont) +/************************************************* + * Name: poly_invntt_tomont + * + * Description: Computes inverse of negacyclic number-theoretic transform (NTT) + * of a polynomial in place; + * inputs assumed to be in bitreversed order, output in normal + * order + * + * The input is assumed to be in bitreversed order, and can + * have arbitrary coefficients in int16_t. + * + * The output polynomial is in normal order, and + * coefficient-wise bound by INVNTT_BOUND in absolute value. + * + * Arguments: - poly *r: pointer to in/output polynomial + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *r) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND)) +); + +#endif /* POLY_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly_k.c similarity index 97% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/polyvec.c rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly_k.c index 50ea1c34a..c2d330ea9 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/polyvec.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly_k.c @@ -2,13 +2,12 @@ * Copyright (c) 2024 The mlkem-native project authors * SPDX-License-Identifier: Apache-2.0 */ -#include "polyvec.h" +#include "poly_k.h" #include #include #include "arith_backend.h" -#include "cbd.h" -#include "ntt.h" 
-#include "poly.h" +#include "compress.h" +#include "sampling.h" #include "symmetric.h" #include "debug.h" @@ -131,7 +130,9 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, /* Omitting bounds assertion for cache since native implementations may * decide not to use a mulcache. Note that the C backend implementation * of poly_basemul_montgomery_cached() does still include the check. */ - polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache); + polyvec_basemul_acc_montgomery_cached_native(r->coeffs, (const int16_t *)a, + (const int16_t *)b, + (const int16_t *)b_cache); } #endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly_k.h similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/polyvec.h rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly_k.h index 8be8579e0..0aea95912 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/polyvec.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly_k.h @@ -2,11 +2,12 @@ * Copyright (c) 2024 The mlkem-native project authors * SPDX-License-Identifier: Apache-2.0 */ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef POLY_K_H +#define POLY_K_H #include #include "common.h" +#include "compress.h" #include "poly.h" #define polyvec MLKEM_NAMESPACE_K(polyvec) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/reduce.h deleted file mode 100644 index b432a4201..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/reduce.h +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef REDUCE_H -#define REDUCE_H - -#include -#include "cbmc.h" -#include "common.h" -#include "debug.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. 
with varying security levels) - * within a single compilation unit. */ -#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16) -#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic) -#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce) -#define fqmul MLKEM_NAMESPACE(fqmul) -#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce) -/* End of static namespacing */ - -#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */ - -/************************************************* - * Name: cast_uint16_to_int16 - * - * Description: Cast uint16 value to int16 - * - * Returns: - * input x in 0 .. 32767: returns value unchanged - * input x in 32768 .. 65535: returns (x - 65536) - **************************************************/ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "conversion" -#endif -ALWAYS_INLINE -static INLINE int16_t cast_uint16_to_int16(uint16_t x) -{ - /* - * PORTABILITY: This relies on uint16_t -> int16_t - * being implemented as the inverse of int16_t -> uint16_t, - * which is implementation-defined (C99 6.3.1.3 (3)) - * CBMC (correctly) fails to prove this conversion is OK, - * so we have to suppress that check here - */ - return (int16_t)x; -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************* - * Name: montgomery_reduce_generic - * - * Description: Generic Montgomery reduction; given a 32-bit integer a, computes - * 16-bit integer congruent to a * R^-1 mod q, where R=2^16 - * - * Arguments: - int32_t a: input integer to be reduced - * - * Returns: integer congruent to a * R^-1 modulo q, with absolute value - * <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2 - * - **************************************************/ -ALWAYS_INLINE -static INLINE int16_t montgomery_reduce_generic(int32_t a) -{ - /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */ - const uint32_t QINV = 62209; /* q^-1 mod 2^16 */ - - /* Compute a*q^{-1} mod 2^16 in unsigned 
representatives */ - const uint16_t a_reduced = a & UINT16_MAX; - const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX; - - /* Lift to signed canonical representative mod 2^16. */ - const int16_t t = cast_uint16_to_int16(a_inverted); - - int32_t r = a - ((int32_t)t * MLKEM_Q); - /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */ - - /* - * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, - * implementation-defined for negative left argument. Here, - * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) - */ - r = r >> 16; - /* Bounds: |r >> 16| <= ceil(|r| / 2^16) - * <= ceil(|a| / 2^16 + MLKEM_Q / 2) - * <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 - * - * (Note that |a >> n| = ceil(|a| / 2^16) for negative a) - */ - - return (int16_t)r; -} - -/************************************************* - * Name: montgomery_reduce - * - * Description: Montgomery reduction - * - * Arguments: - int32_t a: input integer to be reduced - * Must be smaller than 2 * 2^12 * 2^15 in absolute value. - * - * Returns: integer congruent to a * R^-1 modulo q, - * smaller than 2 * q in absolute value. - **************************************************/ -static INLINE int16_t montgomery_reduce(int32_t a) -__contract__( - requires(a > -(2 * UINT12_LIMIT * 32768)) - requires(a < (2 * UINT12_LIMIT * 32768)) - ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q) -) -{ - int16_t res; - debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768); - - res = montgomery_reduce_generic(a); - /* Bounds: - * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2 - * <= UINT12_LIMIT + (MLKEM_Q + 1) / 2 - * < 2 * MLKEM_Q */ - - debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q); - return res; -} - -/************************************************* - * Name: fqmul - * - * Description: Montgomery multiplication modulo q=3329 - * - * Arguments: - int16_t a: first factor - * Can be any int16_t. 
- * - int16_t b: second factor. - * Must be signed canonical (abs value <(q+1)/2) - * - * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and - * smaller than q in absolute value. - * - **************************************************/ -static INLINE int16_t fqmul(int16_t a, int16_t b) -__contract__( - requires(b > -HALF_Q) - requires(b < HALF_Q) - ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q) -) -{ - int16_t res; - debug_assert_abs_bound(&b, 1, HALF_Q); - - res = montgomery_reduce((int32_t)a * (int32_t)b); - /* Bounds: - * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2 - * < MLKEM_Q - */ - - debug_assert_abs_bound(&res, 1, MLKEM_Q); - return res; -} - -/************************************************* - * Name: barrett_reduce - * - * Description: Barrett reduction; given a 16-bit integer a, computes - * centered representative congruent to a mod q in - * {-(q-1)/2,...,(q-1)/2} - * - * Arguments: - int16_t a: input integer to be reduced - * - * Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. - **************************************************/ -static INLINE int16_t barrett_reduce(int16_t a) -__contract__( - ensures(return_value > -HALF_Q && return_value < HALF_Q) -) -{ - /* - * To divide by MLKEM_Q using Barrett multiplication, the "magic number" - * multiplier is round_to_nearest(2**26/MLKEM_Q) - */ - const int BPOWER = 26; - const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q; - - /* - * Compute round_to_nearest(a/MLKEM_Q) using the multiplier - * above and shift by BPOWER places. - * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, - * implementation-defined for negative left argument. Here, - * we assume it's sign-preserving "arithmetic" shift right. 
(C99 6.5.7 (5)) - */ - const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER; - - /* - * t is in -10 .. +10, so we need 32-bit math to - * evaluate t * MLKEM_Q and the subsequent subtraction - */ - int16_t res = (int16_t)(a - t * MLKEM_Q); - - debug_assert_abs_bound(&res, 1, HALF_Q); - return res; -} - -#endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/sampling.c similarity index 73% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/rej_uniform.c rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/sampling.c index cbbe4407f..98cbdcb74 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/rej_uniform.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/sampling.c @@ -9,7 +9,7 @@ #include "debug.h" #include "fips202.h" #include "fips202x4.h" -#include "rej_uniform.h" +#include "sampling.h" #include "symmetric.h" /* Static namespacing @@ -18,6 +18,8 @@ * within a single compilation unit. */ #define rej_uniform MLKEM_NAMESPACE(rej_uniform) #define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar) +#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) +#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) /* End of static namespacing */ static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target, @@ -233,9 +235,113 @@ void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) xof_release(&state); } +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) +#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) +/* End of static namespacing */ + +/************************************************* + * Name: load32_littleendian + * + * Description: load 4 bytes into a 32-bit integer + * in little-endian order + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns 32-bit unsigned integer loaded from x + **************************************************/ +static uint32_t load32_littleendian(const uint8_t x[4]) +{ + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + r |= (uint32_t)x[3] << 24; + return r; +} + +MLKEM_NATIVE_INTERNAL_API +void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3))) + { + unsigned j; + uint32_t t = load32_littleendian(buf + 4 * i); + uint32_t d = t & 0x55555555; + d += (t >> 1) & 0x55555555; + + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3))) + { + const int16_t a = (d >> (4 * j + 0)) & 0x3; + const int16_t b = (d >> (4 * j + 2)) & 0x3; + r->coeffs[8 * i + j] = a - b; + } + } +} + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 +/************************************************* + * Name: load24_littleendian + * + * Description: load 3 bytes into a 32-bit integer + * in little-endian order. 
+ * This function is only needed for ML-KEM-512 + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) + **************************************************/ +static uint32_t load24_littleendian(const uint8_t x[3]) +{ + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + return r; +} + +MLKEM_NATIVE_INTERNAL_API +void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 4; i++) + __loop__( + invariant(i <= MLKEM_N / 4) + invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4))) + { + unsigned j; + const uint32_t t = load24_littleendian(buf + 3 * i); + uint32_t d = t & 0x00249249; + d += (t >> 1) & 0x00249249; + d += (t >> 2) & 0x00249249; + + for (j = 0; j < 4; j++) + __loop__( + invariant(i <= MLKEM_N / 4 && j <= 4) + invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4))) + { + const int16_t a = (d >> (6 * j + 0)) & 0x7; + const int16_t b = (d >> (6 * j + 3)) & 0x7; + r->coeffs[4 * i + j] = a - b; + } + } +} +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \ + 3 */ + #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ -#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform) -int empty_cu_rej_uniform; +#define empty_cu_sampling MLKEM_NAMESPACE_K(empty_cu_sampling) +int empty_cu_sampling; #endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/sampling.h similarity index 63% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/rej_uniform.h rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/sampling.h index 801287259..cc524e0fc 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/rej_uniform.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/sampling.h @@ -2,8 +2,8 @@ * Copyright (c) 2024 The mlkem-native project 
authors * SPDX-License-Identifier: Apache-2.0 */ -#ifndef REJ_UNIFORM_H -#define REJ_UNIFORM_H +#ifndef SAMPLING_H +#define SAMPLING_H #include #include @@ -11,6 +11,37 @@ #include "common.h" #include "poly.h" +#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2) +/************************************************* + * Name: poly_cbd2 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter eta=2 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]); + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 +#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3) +/************************************************* + * Name: poly_cbd3 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter eta=3. 
+ * This function is only needed for ML-KEM-512 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]); +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */ + #define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4) /************************************************* * Name: poly_rej_uniform_x4 @@ -60,4 +91,4 @@ __contract__( assigns(memory_slice(entry, sizeof(poly))) ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q))); -#endif /* REJ_UNIFORM_H */ +#endif /* SAMPLING_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/zetas.c index 4ef887c62..987f0dce4 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/zetas.c @@ -10,7 +10,7 @@ #include "common.h" #if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) -#include "ntt.h" +#include "poly.h" /* * Table of zeta values used in the reference NTT and inverse NTT. diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/arith_backend.h index 0543b1bd1..ade31cda1 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/arith_backend.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/arith_backend.h @@ -17,7 +17,7 @@ * Keep this _after_ the inclusion of the backend; otherwise, * the sanity checks won't have an effect. 
*/ #if defined(MLKEM_NATIVE_CHECK_APIS) -#include "api.h" +#include "native/api.h" #endif #endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbd.c deleted file mode 100644 index 1e6b7c5d1..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbd.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED - -#include -#include "cbd.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) -#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) -/* End of static namespacing */ - -/************************************************* - * Name: load32_littleendian - * - * Description: load 4 bytes into a 32-bit integer - * in little-endian order - * - * Arguments: - const uint8_t *x: pointer to input byte array - * - * Returns 32-bit unsigned integer loaded from x - **************************************************/ -static uint32_t load32_littleendian(const uint8_t x[4]) -{ - uint32_t r; - r = (uint32_t)x[0]; - r |= (uint32_t)x[1] << 8; - r |= (uint32_t)x[2] << 16; - r |= (uint32_t)x[3] << 24; - return r; -} - -MLKEM_NATIVE_INTERNAL_API -void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3))) - { - unsigned j; - uint32_t t = load32_littleendian(buf + 4 * i); - uint32_t d = t & 0x55555555; - d += (t >> 1) & 0x55555555; - - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3))) - { - const int16_t a = (d >> (4 * j + 0)) & 
0x3; - const int16_t b = (d >> (4 * j + 2)) & 0x3; - r->coeffs[8 * i + j] = a - b; - } - } -} - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 -/************************************************* - * Name: load24_littleendian - * - * Description: load 3 bytes into a 32-bit integer - * in little-endian order. - * This function is only needed for ML-KEM-512 - * - * Arguments: - const uint8_t *x: pointer to input byte array - * - * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) - **************************************************/ -static uint32_t load24_littleendian(const uint8_t x[3]) -{ - uint32_t r; - r = (uint32_t)x[0]; - r |= (uint32_t)x[1] << 8; - r |= (uint32_t)x[2] << 16; - return r; -} - -MLKEM_NATIVE_INTERNAL_API -void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 4; i++) - __loop__( - invariant(i <= MLKEM_N / 4) - invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4))) - { - unsigned j; - const uint32_t t = load24_littleendian(buf + 3 * i); - uint32_t d = t & 0x00249249; - d += (t >> 1) & 0x00249249; - d += (t >> 2) & 0x00249249; - - for (j = 0; j < 4; j++) - __loop__( - invariant(i <= MLKEM_N / 4 && j <= 4) - invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4))) - { - const int16_t a = (d >> (6 * j + 0)) & 0x7; - const int16_t b = (d >> (6 * j + 3)) & 0x7; - r->coeffs[4 * i + j] = a - b; - } - } -} -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \ - 3 */ - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd) -int empty_cu_cbd; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbd.h deleted file mode 100644 index 54c1f5b90..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbd.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 
2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef CBD_H -#define CBD_H - -#include -#include "common.h" -#include "poly.h" - -#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2) -/************************************************* - * Name: poly_cbd2 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter eta=2 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]); - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 -#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3) -/************************************************* - * Name: poly_cbd3 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter eta=3. 
- * This function is only needed for ML-KEM-512 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]); -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */ - -#endif /* CBD_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/common.h index 4f326333e..62ed53ab1 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/common.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/common.h @@ -15,12 +15,19 @@ #include "sys.h" /* Include backend metadata */ -#if defined(MLKEM_USE_NATIVE) -#if defined(MLKEM_NATIVE_ARITH_BACKEND) -#include MLKEM_NATIVE_ARITH_BACKEND +#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) +#if defined(MLKEM_NATIVE_ARITH_BACKEND_FILE) +#include MLKEM_NATIVE_ARITH_BACKEND_FILE +#else +#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_ARITH is set, but MLKEM_NATIVE_ARITH_BACKEND_FILE is not. +#endif #endif -#if defined(MLKEM_NATIVE_FIPS202_BACKEND) -#include MLKEM_NATIVE_FIPS202_BACKEND + +#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) +#if defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE) +#include MLKEM_NATIVE_FIPS202_BACKEND_FILE +#else +#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, but MLKEM_NATIVE_FIPS202_BACKEND_FILE is not. 
#endif #endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/compress.c new file mode 100644 index 000000000..a03fe0ac4 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/compress.c @@ -0,0 +1,395 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include "common.h" +#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) + +#include +#include +#include "arith_backend.h" +#include "cbmc.h" +#include "compress.h" +#include "debug.h" +#include "verify.h" + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3) +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + uint8_t t[8] = {0}; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_bound(t, 0, j, 0, 16))) + { + t[j] = scalar_compress_d4(a->coeffs[8 * i + j]); + } + + r[i * 4] = t[0] | (t[1] << 4); + r[i * 4 + 1] = t[2] | (t[3] << 4); + r[i * 4 + 2] = t[4] | (t[5] << 4); + r[i * 4 + 3] = t[6] | (t[7] << 4); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a) +{ + unsigned j; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + for (j = 0; j < MLKEM_N / 4; j++) + __loop__(invariant(j <= MLKEM_N / 4)) + { + unsigned k; + uint16_t t[4]; + for (k = 0; k < 4; k++) + __loop__( + invariant(k <= 4) + invariant(forall(r, 0, k, t[r] < (1u << 10)))) + { + t[k] = scalar_compress_d10(a->coeffs[4 * j + k]); + } + + /* + * Make all implicit truncation explicit. No data is being + * truncated for the LHS's since each t[i] is 10-bit in size. 
+ */ + r[5 * j + 0] = (t[0] >> 0) & 0xFF; + r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF); + r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF); + r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF); + r[5 * j + 4] = (t[3] >> 2); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 2; i++) + __loop__( + invariant(i <= MLKEM_N / 2) + invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q))) + { + r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF); + r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF); + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d10(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]) +{ + unsigned j; + for (j = 0; j < MLKEM_N / 4; j++) + __loop__( + invariant(j <= MLKEM_N / 4) + invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q))) + { + unsigned k; + uint16_t t[4]; + uint8_t const *base = &a[5 * j]; + + t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); + t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6)); + t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4)); + t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2)); + + for (k = 0; k < 4; k++) + __loop__( + invariant(k <= 4) + invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q))) + { + r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ + || MLKEM_K == 3) */ + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + uint8_t t[8] = {0}; + for (j = 0; 
j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_bound(t, 0, j, 0, 32))) + { + t[j] = scalar_compress_d5(a->coeffs[8 * i + j]); + } + + /* + * Explicitly truncate to avoid warning about + * implicit truncation in CBMC, and use array indexing into + * r rather than pointer-arithmetic to simplify verification + */ + r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5)); + r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); + r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4)); + r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); + r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3)); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a) +{ + unsigned j; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (j = 0; j < MLKEM_N / 8; j++) + __loop__(invariant(j <= MLKEM_N / 8)) + { + unsigned k; + uint16_t t[8]; + for (k = 0; k < 8; k++) + __loop__( + invariant(k <= 8) + invariant(forall(r, 0, k, t[r] < (1u << 11)))) + { + t[k] = scalar_compress_d11(a->coeffs[8 * j + k]); + } + + /* + * Make all implicit truncation explicit. No data is being + * truncated for the LHS's since each t[i] is 11-bit in size. 
+ */ + r[11 * j + 0] = (t[0] >> 0) & 0xFF; + r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF); + r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF); + r[11 * j + 3] = (t[2] >> 2) & 0xFF; + r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF); + r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF); + r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF); + r[11 * j + 7] = (t[5] >> 1) & 0xFF; + r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF); + r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF); + r[11 * j + 10] = (t[7] >> 3); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) + { + unsigned j; + uint8_t t[8]; + const unsigned offset = i * 5; + /* + * Explicitly truncate to avoid warning about + * implicit truncation in CBMC and unwind loop for ease + * of proof. + */ + + /* + * Decompress 5 8-bit bytes (so 40 bits) into + * 8 5-bit values stored in t[] + */ + t[0] = 0x1F & (a[offset + 0] >> 0); + t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3)); + t[2] = 0x1F & (a[offset + 1] >> 2); + t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1)); + t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4)); + t[5] = 0x1F & (a[offset + 3] >> 1); + t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2)); + t[7] = 0x1F & (a[offset + 4] >> 3); + + /* and copy to the correct slice in r[] */ + for (j = 0; j < 8; j++) + __loop__( + invariant(j <= 8 && i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) + { + r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d11(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]) +{ + unsigned j; + for (j = 0; j < MLKEM_N / 8; j++) + __loop__( + invariant(j <= MLKEM_N 
/ 8) + invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q))) + { + unsigned k; + uint16_t t[8]; + uint8_t const *base = &a[11 * j]; + t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); + t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5)); + t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) | + ((uint16_t)base[4] << 10)); + t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7)); + t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4)); + t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) | + ((uint16_t)base[8] << 9)); + t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6)); + t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3)); + + for (k = 0; k < 8; k++) + __loop__( + invariant(k <= 8) + invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q))) + { + r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */ + +#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 2; i++) + __loop__(invariant(i <= MLKEM_N / 2)) + { + const uint16_t t0 = a->coeffs[2 * i]; + const uint16_t t1 = a->coeffs[2 * i + 1]; + /* + * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of + * significant data, so these can be packed into 24 bits or exactly + * 3 bytes, as follows. + */ + + /* Least significant bits 0 - 7 of t0. */ + r[3 * i + 0] = t0 & 0xFF; + + /* + * Most significant bits 8 - 11 of t0 become the least significant + * nibble of the second byte. The least significant 4 bits + * of t1 become the upper nibble of the second byte. + */ + r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0); + + /* Bits 4 - 11 of t1 become the third byte. 
*/ + r[3 * i + 2] = t1 >> 4; + } +} +#else /* MLKEM_USE_NATIVE_POLY_TOBYTES */ +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +{ + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + poly_tobytes_native(r, a->coeffs); +} +#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + +#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 2; i++) + __loop__( + invariant(i <= MLKEM_N / 2) + invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT))) + { + const uint8_t t0 = a[3 * i + 0]; + const uint8_t t1 = a[3 * i + 1]; + const uint8_t t2 = a[3 * i + 2]; + r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF); + r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4); + } + + /* Note that the coefficients are not canonical */ + debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT); +} +#else /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +{ + poly_frombytes_native(r->coeffs, a); +} +#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ + +MLKEM_NATIVE_INTERNAL_API +void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +{ + unsigned i; +#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8) +#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!" 
+#endif + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) + { + unsigned j; + for (j = 0; j < 8; j++) + __loop__( + invariant(i < MLKEM_N / 8 && j <= 8) + invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) + { + /* Prevent the compiler from recognizing this as a bit selection */ + uint8_t mask = value_barrier_u8(1u << j); + r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask); + } + } + debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + msg[i] = 0; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8)) + { + uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]); + msg[i] |= t << j; + } + } +} + +#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ + +#define empty_cu_compress MLKEM_NAMESPACE_K(empty_cu_compress) +int empty_cu_compress; + +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/compress.h new file mode 100644 index 000000000..409dbe519 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/compress.h @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef COMPRESS_H +#define COMPRESS_H + +#include +#include +#include "cbmc.h" +#include "common.h" +#include "debug.h" +#include "poly.h" +#include "verify.h" + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1) +#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4) +#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5) +#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10) +#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11) +#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4) +#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5) +#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10) +#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11) +/* End of static namespacing */ + +/************************************************************ + * Name: scalar_compress_d1 + * + * Description: Computes round(u * 2 / q) % 2 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 1. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. + */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d1(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 2) + ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) ) +{ + uint32_t d0 = u << 1; + d0 *= 645083; + d0 += 1u << 30; + d0 >>= 31; + return d0; +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_compress_d4 + * + * Description: Computes round(u * 16 / q) % 16 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 4. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed.
+ ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. + */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d4(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 16) + ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16)) +{ + uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */ + return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */ +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d4 + * + * Description: Computes round(u * q / 16) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 4. + * + * Arguments: - u: Unsigned canonical modulus modulo 16 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d4(uint32_t u) +__contract__( + requires(0 <= u && u < 16) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 8) / 16; } + +/************************************************************ + * Name: scalar_compress_d5 + * + * Description: Computes round(u * 32 / q) % 32 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 5. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. 
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d5(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 32) + ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) ) +{ + uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */ + return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */ +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d5 + * + * Description: Computes round(u * q / 32) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 5. + * + * Arguments: - u: Unsigned canonical modulus modulo 32 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d5(uint32_t u) +__contract__( + requires(0 <= u && u < 32) + ensures(return_value <= MLKEM_Q - 1) +) { return ((u * MLKEM_Q) + 16) / 32; } + +/************************************************************ + * Name: scalar_compress_d10 + * + * Description: Computes round(u * 2**10 / q) % 2**10 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 10. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. 
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d10(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < (1u << 10)) + ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10))) +{ + uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^33 / MLKEM_Q) */ + d0 = (d0 + ((uint64_t)1u << 32)) >> 33; + return (d0 & 0x3FF); +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d10 + * + * Description: Computes round(u * q / 1024) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 10. + * + * Arguments: - u: Unsigned canonical modulus modulo 1024 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d10(uint32_t u) +__contract__( + requires(0 <= u && u < 1024) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 512) / 1024; } + +/************************************************************ + * Name: scalar_compress_d11 + * + * Description: Computes round(u * 2**11 / q) % 2**11 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 11. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required.
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d11(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < (1u << 11)) + ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11))) +{ + uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */ + d0 = (d0 + ((uint64_t)1u << 32)) >> 33; + return (d0 & 0x7FF); +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d11 + * + * Description: Computes round(u * q / 2048) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 11. + * + * Arguments: - u: Unsigned canonical modulus modulo 2048 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d11(uint32_t u) +__contract__( + requires(0 <= u && u < 2048) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 1024) / 2048; } + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \ + (MLKEM_K == 2 || MLKEM_K == 3) +#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4) +/************************************************* + * Name: poly_compress_d4 + * + * Description: Compression (4 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a); + +#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10) +/************************************************* + * Name: poly_compress_d10 + * + * Description: Compression (10 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a); + +#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4) +/************************************************* + * Name: poly_decompress_d4 + * + * Description: De-serialization and subsequent decompression (4 bits) of a + * polynomial; approximate inverse of poly_compress_d4 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q).
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]); + +#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10) +/************************************************* + * Name: poly_decompress_d10 + * + * Description: De-serialization and subsequent decompression (10 bits) of a + * polynomial; approximate inverse of poly_compress_d10 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d10(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]); +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ + || MLKEM_K == 3) */ + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 +#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5) +/************************************************* + * Name: poly_compress_d5 + * + * Description: Compression (5 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a); + +#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11) +/************************************************* + * Name: poly_compress_d11 + * + * Description: Compression (11 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a); + +#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5) +/************************************************* + * Name: poly_decompress_d5 + * + * Description: De-serialization and subsequent decompression (5 bits) of a + * polynomial; approximate inverse of poly_compress_d5 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q).
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]); + +#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11) +/************************************************* + * Name: poly_decompress_d11 + * + * Description: De-serialization and subsequent decompression (11 bits) of a + * polynomial; approximate inverse of poly_compress_d11 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d11(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]); +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \ + */ + +#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes) +/************************************************* + * Name: poly_tobytes + * + * Description: Serialization of a polynomial. + * Signed coefficients are converted to + * unsigned form before serialization. 
+ * + * Arguments: INPUT: + * - a: const pointer to input polynomial, + * with each coefficient in the range [0,1,..,Q-1] + * OUTPUT + * - r: pointer to output byte array + * (of MLKEM_POLYBYTES bytes) + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYBYTES)) + requires(memory_no_alias(a, sizeof(poly))) + requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(r)) +); + + +#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes) +/************************************************* + * Name: poly_frombytes + * + * Description: De-serialization of a polynomial. + * + * Arguments: INPUT + * - a: pointer to input byte array + * (of MLKEM_POLYBYTES bytes) + * OUTPUT + * - r: pointer to output polynomial, with + * each coefficient unsigned and in the range + * 0 .. 4095 + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)) +); + + +#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg) +/************************************************* + * Name: poly_frommsg + * + * Description: Convert 32-byte message to polynomial + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *msg: pointer to input message + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +__contract__( + requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(object_whole(r)) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) +); + +#define poly_tomsg 
MLKEM_NAMESPACE(poly_tomsg) +/************************************************* + * Name: poly_tomsg + * + * Description: Convert polynomial to 32-byte message + * + * Arguments: - uint8_t *msg: pointer to output message + * - const poly *r: pointer to input polynomial + * Coefficients must be unsigned canonical + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r) +__contract__( + requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(msg)) +); + +#endif /* COMPRESS_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/config.h index fa89370ce..e975ede95 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/config.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/config.h @@ -122,46 +122,87 @@ /* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ /****************************************************************************** - * Name: MLKEM_USE_NATIVE + * Name: MLKEM_USE_NATIVE_BACKEND_ARITH * - * Description: Determines whether a native backend should - * be used, if available. + * Description: Determines whether a native arithmetic backend should be used. + * + * The arithmetic backend covers performance critical functions + * such as the number-theoretic transform (NTT). + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the arithmetic backend to be used is + * determined by MLKEM_NATIVE_ARITH_BACKEND_FILE: If the latter is + * unset, the default backend for the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. * * This can also be set using CFLAGS.
* *****************************************************************************/ -#if !defined(MLKEM_USE_NATIVE) -/* #define MLKEM_USE_NATIVE */ +#if !defined(MLKEM_USE_NATIVE_BACKEND_ARITH) +/* #define MLKEM_USE_NATIVE_BACKEND_ARITH */ #endif /****************************************************************************** - * Name: MLKEM_NATIVE_ARITH_BACKEND + * Name: MLKEM_NATIVE_ARITH_BACKEND_FILE * * Description: The arithmetic backend to use. * - * This must be the filename of an arithmetic backend. - * See the existing backends for examples. + * If MLKEM_USE_NATIVE_BACKEND_ARITH is unset, this option + * is ignored. + * + * If MLKEM_USE_NATIVE_BACKEND_ARITH is set, this option must + * either be undefined or the filename of an arithmetic backend. + * If unset, the default backend will be used. * * This can be set using CFLAGS. * *****************************************************************************/ -#if defined(MLKEM_USE_NATIVE) && !defined(MLKEM_NATIVE_ARITH_BACKEND) -#define MLKEM_NATIVE_ARITH_BACKEND "default.h" -#endif /* MLKEM_NATIVE_ARITH_BACKEND */ +#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) && \ + !defined(MLKEM_NATIVE_ARITH_BACKEND_FILE) +#define MLKEM_NATIVE_ARITH_BACKEND_FILE "native/default.h" +#endif /****************************************************************************** - * Name: MLKEM_NATIVE_FIPS202_BACKEND + * Name: MLKEM_USE_NATIVE_BACKEND_FIPS202 + * + * Description: Determines whether a native FIPS202 backend should be used. + * + * The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is + * the performance bottleneck of SHA3 and SHAKE. + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the FIPS202 backend to be used is + * determined by MLKEM_NATIVE_FIPS202_BACKEND_FILE: If the latter is + * unset, the default backend for the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. + * + * This can also be set using CFLAGS.
+ * + *****************************************************************************/ +#if !defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) +/* #define MLKEM_USE_NATIVE_BACKEND_FIPS202 */ +#endif + +/****************************************************************************** + * Name: MLKEM_NATIVE_FIPS202_BACKEND_FILE * * Description: The FIPS-202 backend to use. * - * This must be the filename of an FIPS-202 backend. + * If MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, this option must + * either be undefined or the filename of a FIPS202 backend. + * If unset, the default backend will be used. * * This can be set using CFLAGS. * *****************************************************************************/ -#if defined(MLKEM_USE_NATIVE_FIPS202) && !defined(MLKEM_NATIVE_FIPS202_BACKEND) -#define MLKEM_NATIVE_FIPS202_BACKEND "native/default.h" -#endif /* MLKEM_NATIVE_FIPS202_BACKEND */ +#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) && \ + !defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE) +#define MLKEM_NATIVE_FIPS202_BACKEND_FILE "fips202/native/default.h" +#endif /************************* Config internals ********************************/ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.c index 0cfcc3e9e..318d0fc77 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.c @@ -9,11 +9,10 @@ #include "fips202.h" #include "fips202x4.h" #include "indcpa.h" -#include "ntt.h" #include "poly.h" -#include "polyvec.h" +#include "poly_k.h" #include "randombytes.h" -#include "rej_uniform.h" +#include "sampling.h" #include "symmetric.h" #include "arith_backend.h" @@ -149,14 +148,14 @@ static void unpack_ciphertext(polyvec *b, poly *v, #define poly_permute_bitrev_to_custom \ MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom) -static INLINE void poly_permute_bitrev_to_custom(poly *data) +static INLINE void poly_permute_bitrev_to_custom(int16_t 
data[MLKEM_N]) __contract__( /* We don't specify that this should be a permutation, but only * that it does not change the bound established at the end of gen_matrix. */ - requires(memory_no_alias(data, sizeof(poly))) - requires(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N)) + requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q)) assigns(memory_slice(data, sizeof(poly))) - ensures(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); } + ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); } #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ /* Not static for benchmarking */ @@ -245,7 +244,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed) { for (j = 0; j < MLKEM_K; j++) { - poly_permute_bitrev_to_custom(&a[i].vec[j]); + poly_permute_bitrev_to_custom(a[i].vec[j].coeffs); } } } diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.h index 2c4fda3c4..b4d5985bf 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.h @@ -8,7 +8,7 @@ #include #include "cbmc.h" #include "common.h" -#include "polyvec.h" +#include "poly_k.h" #define gen_matrix MLKEM_NAMESPACE_K(gen_matrix) /************************************************* diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/api.h similarity index 90% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/api.h rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/api.h index 792ecb8a4..0704f9dcd 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/api.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/api.h @@ -23,8 +23,7 @@ #define MLKEM_NATIVE_ARITH_NATIVE_API_H #include -#include "poly.h" -#include "polyvec.h" +#include "../common.h" /* * This is the 
C<->native interface allowing for the drop-in of @@ -65,9 +64,9 @@ * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER * for more information. * - * Arguments: - poly *p: pointer to in/output polynomial + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial **************************************************/ -static INLINE void ntt_native(poly *); +static INLINE void ntt_native(int16_t p[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_NTT */ #if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER) @@ -96,10 +95,10 @@ and to/from bytes conversions." * * This must only be defined if there is native code for * all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache. - * Arguments: - poly *p: pointer to in/output polynomial + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial * **************************************************/ -static INLINE void poly_permute_bitrev_to_custom(poly *); +static INLINE void poly_permute_bitrev_to_custom(int16_t p[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ #if defined(MLKEM_USE_NATIVE_INTT) @@ -117,7 +116,7 @@ static INLINE void poly_permute_bitrev_to_custom(poly *); * * Arguments: - uint16_t *a: pointer to in/output polynomial **************************************************/ -static INLINE void intt_native(poly *); +static INLINE void intt_native(int16_t p[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_INTT */ #if defined(MLKEM_USE_NATIVE_POLY_REDUCE) @@ -126,9 +125,9 @@ static INLINE void intt_native(poly *); * * Description: Applies modular reduction to all coefficients of a polynomial. 
* - * Arguments: - poly *r: pointer to input/output polynomial + * Arguments: - int16_t r[MLKEM_N]: pointer to input/output polynomial **************************************************/ -static INLINE void poly_reduce_native(poly *); +static INLINE void poly_reduce_native(int16_t p[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ #if defined(MLKEM_USE_NATIVE_POLY_TOMONT) @@ -138,9 +137,9 @@ static INLINE void poly_reduce_native(poly *); * Description: Inplace conversion of all coefficients of a polynomial * from normal domain to Montgomery domain * - * Arguments: - poly *r: pointer to input/output polynomial + * Arguments: - int16_t r[MLKEM_N]: pointer to input/output polynomial **************************************************/ -static INLINE void poly_tomont_native(poly *); +static INLINE void poly_tomont_native(int16_t p[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ #if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) @@ -165,8 +164,8 @@ static INLINE void poly_tomont_native(poly *); * OUTPUT * - cache: pointer to multiplication cache **************************************************/ -static INLINE void poly_mulcache_compute_native(poly_mulcache *cache, - const poly *poly); +static INLINE void poly_mulcache_compute_native(int16_t cache[MLKEM_N / 2], + const int16_t poly[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ #if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) @@ -189,8 +188,9 @@ static INLINE void poly_mulcache_compute_native(poly_mulcache *cache, * in NTT domain, and of the same order as a and b. 
**************************************************/ static INLINE void polyvec_basemul_acc_montgomery_cached_native( - poly *r, const polyvec *a, const polyvec *b, - const polyvec_mulcache *b_cache); + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]); #endif #if defined(MLKEM_USE_NATIVE_POLY_TOBYTES) @@ -209,7 +209,7 @@ static INLINE void polyvec_basemul_acc_montgomery_cached_native( * (of MLKEM_POLYBYTES bytes) **************************************************/ static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], - const poly *a); + const int16_t a[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ #if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) @@ -226,7 +226,7 @@ static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], * - a: const pointer to input byte aray * (of MLKEM_POLYBYTES bytes) **************************************************/ -static INLINE void poly_frombytes_native(poly *a, +static INLINE void poly_frombytes_native(int16_t a[MLKEM_N], const uint8_t r[MLKEM_POLYBYTES]); #endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/default.h similarity index 97% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/default.h rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/default.h index d1e41c52e..f9fe4310a 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/default.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/default.h @@ -8,7 +8,7 @@ /* * Default arithmetic backend */ -#include "sys.h" +#include "../sys.h" #ifdef SYS_AARCH64 /* diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/README.md b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/README.md similarity index 100% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/README.md rename 
to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/README.md diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/default.h similarity index 90% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/default.h rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/default.h index 592e8996d..73f53dc13 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/default.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/default.h @@ -19,6 +19,6 @@ /* Filename of the C backend implementation. * This is not inlined here because this header is included in assembly * files as well. */ -#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "x86_64/src/default_impl.h" +#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/x86_64/src/default_impl.h" #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/align.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/align.h similarity index 100% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/align.h rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/align.h diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/arith_native_x86_64.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/arith_native_x86_64.h similarity index 91% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/arith_native_x86_64.h rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/arith_native_x86_64.h index 25e00a930..acde977ad 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/arith_native_x86_64.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/arith_native_x86_64.h @@ -5,11 +5,10 @@ #ifndef MLKEM_X86_64_NATIVE_H #define MLKEM_X86_64_NATIVE_H -#include "common.h" +#include "../../../common.h" #include 
#include -#include "polyvec.h" #include "consts.h" #define REJ_UNIFORM_AVX_NBLOCKS 3 /* See MLKEM_GEN_MATRIX_NBLOCKS */ @@ -44,8 +43,9 @@ void basemul_avx2(__m256i *r, const __m256i *a, const __m256i *b, #define polyvec_basemul_acc_montgomery_cached_avx2 \ MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_avx2) void polyvec_basemul_acc_montgomery_cached_avx2( - poly *r, const polyvec *a, const polyvec *b, - const polyvec_mulcache *b_cache); + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]); #define ntttobytes_avx2 MLKEM_NAMESPACE(ntttobytes_avx2) void ntttobytes_avx2(uint8_t *r, const __m256i *a, const __m256i *qdata); diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.S rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.S index b97840e70..5fdc3d0a0 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.S @@ -6,7 +6,7 @@ // Implementation from Kyber reference repository // https://github.com/pq-crystals/kyber/blob/main/avx2 -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) #include "consts.h" diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.c similarity index 51% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.c rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.c index 5f9ae99c8..970938306 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.c +++ 
b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.c @@ -3,46 +3,46 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) -#include "poly.h" -#include "polyvec.h" - #include "arith_native_x86_64.h" #include "consts.h" -static void poly_basemul_montgomery_avx2(poly *r, const poly *a, const poly *b) +static void poly_basemul_montgomery_avx2(int16_t r[MLKEM_N], + const int16_t a[MLKEM_N], + const int16_t b[MLKEM_N]) { - basemul_avx2((__m256i *)r->coeffs, (const __m256i *)a->coeffs, - (const __m256i *)b->coeffs, qdata.vec); + basemul_avx2((__m256i *)r, (const __m256i *)a, (const __m256i *)b, qdata.vec); } /* * Implementation from Kyber reference repository * https://github.com/pq-crystals/kyber/blob/main/avx2 */ -static void poly_add_avx2(poly *r, const poly *a, const poly *b) +static void poly_add_avx2(int16_t r[MLKEM_N], const int16_t a[MLKEM_N], + const int16_t b[MLKEM_N]) { unsigned i; __m256i f0, f1; for (i = 0; i < MLKEM_N; i += 16) { - f0 = _mm256_load_si256((const __m256i *)&a->coeffs[i]); - f1 = _mm256_load_si256((const __m256i *)&b->coeffs[i]); + f0 = _mm256_load_si256((const __m256i *)&a[i]); + f1 = _mm256_load_si256((const __m256i *)&b[i]); f0 = _mm256_add_epi16(f0, f1); - _mm256_store_si256((__m256i *)&r->coeffs[i], f0); + _mm256_store_si256((__m256i *)&r[i], f0); } } -void polyvec_basemul_acc_montgomery_cached_avx2(poly *r, const polyvec *a, - const polyvec *b, - const polyvec_mulcache *b_cache) +void polyvec_basemul_acc_montgomery_cached_avx2( + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]) { unsigned i; - poly t; + int16_t t[MLKEM_N] ALIGN; /* TODO: Use mulcache for AVX2. So far, it is unused. 
*/ ((void)b_cache); @@ -50,11 +50,11 @@ void polyvec_basemul_acc_montgomery_cached_avx2(poly *r, const polyvec *a, /* Coefficient-wise bound of each basemul is 2q. * Since we are accumulating at most 4 times, the * overall bound is 8q < INT16_MAX. */ - poly_basemul_montgomery_avx2(r, &a->vec[0], &b->vec[0]); + poly_basemul_montgomery_avx2(r, &a[0], &b[0]); for (i = 1; i < MLKEM_K; i++) { - poly_basemul_montgomery_avx2(&t, &a->vec[i], &b->vec[i]); - poly_add_avx2(r, r, &t); + poly_basemul_montgomery_avx2(t, &a[i * MLKEM_N], &b[i * MLKEM_N]); + poly_add_avx2(r, r, t); } } diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/consts.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/consts.c similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/consts.c rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/consts.c index 86a0835ef..568752ae8 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/consts.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/consts.c @@ -8,7 +8,7 @@ * https://github.com/pq-crystals/kyber/blob/main/avx2/consts.c */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/consts.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/consts.h similarity index 97% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/consts.h rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/consts.h index 00c415952..e2846b609 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/consts.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/consts.h @@ -11,7 +11,7 @@ #ifndef CONSTS_H #define CONSTS_H -#include "common.h" +#include "../../../common.h" #define AVX2_BACKEND_DATA_OFFSET_16XQ 0 #define AVX2_BACKEND_DATA_OFFSET_16XQINV 
16 diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/default_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/default_impl.h similarity index 62% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/default_impl.h rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/default_impl.h index 029111c17..3683361e2 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/default_impl.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/default_impl.h @@ -12,8 +12,7 @@ #include -#include "poly.h" -#include "polyvec.h" +#include "../../../params.h" #include "arith_native_x86_64.h" #define MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER @@ -28,9 +27,9 @@ #define MLKEM_USE_NATIVE_POLY_TOBYTES #define MLKEM_USE_NATIVE_POLY_FROMBYTES -static INLINE void poly_permute_bitrev_to_custom(poly *data) +static INLINE void poly_permute_bitrev_to_custom(int16_t data[MLKEM_N]) { - nttunpack_avx2((__m256i *)(data->coeffs), qdata.vec); + nttunpack_avx2((__m256i *)(data), qdata.vec); } static INLINE int rej_uniform_native(int16_t *r, unsigned int len, @@ -45,27 +44,28 @@ static INLINE int rej_uniform_native(int16_t *r, unsigned int len, return (int)rej_uniform_avx2(r, buf); } -static INLINE void ntt_native(poly *data) +static INLINE void ntt_native(int16_t data[MLKEM_N]) { ntt_avx2((__m256i *)data, qdata.vec); } -static INLINE void intt_native(poly *data) +static INLINE void intt_native(int16_t data[MLKEM_N]) { invntt_avx2((__m256i *)data, qdata.vec); } -static INLINE void poly_reduce_native(poly *data) +static INLINE void poly_reduce_native(int16_t data[MLKEM_N]) { - reduce_avx2((__m256i *)data->coeffs, qdata.vec); + reduce_avx2((__m256i *)data, qdata.vec); } -static INLINE void poly_tomont_native(poly *data) +static INLINE void poly_tomont_native(int16_t data[MLKEM_N]) { - tomont_avx2((__m256i *)data->coeffs, qdata.vec); + tomont_avx2((__m256i *)data, qdata.vec); } -static INLINE void 
poly_mulcache_compute_native(poly_mulcache *x, const poly *y) +static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2], + const int16_t y[MLKEM_N]) { /* AVX2 backend does not use mulcache */ ((void)y); @@ -73,22 +73,23 @@ static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const poly *y) } static INLINE void polyvec_basemul_acc_montgomery_cached_native( - poly *r, const polyvec *a, const polyvec *b, - const polyvec_mulcache *b_cache) + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]) { polyvec_basemul_acc_montgomery_cached_avx2(r, a, b, b_cache); } static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], - const poly *a) + const int16_t a[MLKEM_N]) { - ntttobytes_avx2(r, (const __m256i *)a->coeffs, qdata.vec); + ntttobytes_avx2(r, (const __m256i *)a, qdata.vec); } -static INLINE void poly_frombytes_native(poly *r, +static INLINE void poly_frombytes_native(int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYBYTES]) { - nttfrombytes_avx2((__m256i *)r->coeffs, a, qdata.vec); + nttfrombytes_avx2((__m256i *)r, a, qdata.vec); } #endif /* MLKEM_NATIVE_ARITH_PROFILE_IMPL_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/fq.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/fq.S similarity index 98% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/fq.S rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/fq.S index 134bd4f71..3f013a5fa 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/fq.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/fq.S @@ -11,7 +11,7 @@ // in [0,1,...,q-1] rather than [0,1,...,q], matching the // semantics of poly_reduce(). 
-#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) #include "consts.h" diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/fq.inc b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/fq.inc similarity index 100% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/fq.inc rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/fq.inc diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/intt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/intt.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/intt.S rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/intt.S index 6b1d78ef2..7b1f22624 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/intt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/intt.S @@ -9,7 +9,7 @@ * Changes to placement of modular reductions have * been made to simplify reasoning of non-overflow */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/ntt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntt.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/ntt.S rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntt.S index e8bf7894b..5d928b4cc 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/ntt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntt.S @@ -6,7 +6,7 @@ // Implementation from Kyber reference repository // https://github.com/pq-crystals/kyber/blob/main/avx2 -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) #include "consts.h" diff --git 
a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/rej_uniform_avx2.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_avx2.c similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/rej_uniform_avx2.c rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_avx2.c index 54037a0df..adf2d338b 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/rej_uniform_avx2.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_avx2.c @@ -8,7 +8,7 @@ * https://github.com/pq-crystals/kyber/blob/main/avx2 */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/rej_uniform_table.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_table.c similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/rej_uniform_table.c rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_table.c index 9bbc47146..e95fd9e79 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/rej_uniform_table.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_table.c @@ -8,7 +8,7 @@ * Do not modify it directly. 
*/ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/shuffle.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/shuffle.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/shuffle.S rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/shuffle.S index 5e708748a..9bcd04896 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/shuffle.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/shuffle.S @@ -6,7 +6,7 @@ // Implementation from Kyber reference repository // https://github.com/pq-crystals/kyber/blob/main/avx2 -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/shuffle.inc b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/shuffle.inc similarity index 100% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/shuffle.inc rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/shuffle.inc diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/x86_64_zetas.i b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/x86_64_zetas.i similarity index 100% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/x86_64_zetas.i rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/x86_64_zetas.i diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/ntt.c deleted file mode 100644 index 3651c8da9..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/ntt.c +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#if 
!defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) - -#include -#include "arith_backend.h" -#include "debug.h" -#include "ntt.h" -#include "reduce.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block) -#define ntt_layer MLKEM_NAMESPACE(ntt_layer) -#define invntt_layer MLKEM_NAMESPACE(invntt_layer) -/* End of static namespacing */ - -#if !defined(MLKEM_USE_NATIVE_NTT) -/* - * Computes a block CT butterflies with a fixed twiddle factor, - * using Montgomery multiplication. - * Parameters: - * - r: Pointer to base of polynomial (_not_ the base of butterfly block) - * - root: Twiddle factor to use for the butterfly. This must be in - * Montgomery form and signed canonical. - * - start: Offset to the beginning of the butterfly block - * - len: Index difference between coefficients subject to a butterfly - * - bound: Ghost variable describing coefficient bound: Prior to `start`, - * coefficients must be bound by `bound + MLKEM_Q`. Post `start`, - * they must be bound by `bound`. - * When this function returns, output coefficients in the index range - * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`. 
- * Example: - * - start=8, len=4 - * This would compute the following four butterflies - * 8 -- 12 - * 9 -- 13 - * 10 -- 14 - * 11 -- 15 - * - start=4, len=2 - * This would compute the following two butterflies - * 4 -- 6 - * 5 -- 7 - */ -static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, - unsigned start, unsigned len, int bound) -__contract__( - requires(start < MLKEM_N) - requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N) - requires(0 <= bound && bound < INT16_MAX - MLKEM_Q) - requires(-HALF_Q < zeta && zeta < HALF_Q) - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(array_abs_bound(r, 0, start, bound + MLKEM_Q)) - requires(array_abs_bound(r, start, MLKEM_N, bound)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q)) - ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound))) -{ - /* `bound` is a ghost variable only needed in the CBMC specification */ - unsigned j; - ((void)bound); - for (j = start; j < start + len; j++) - __loop__( - invariant(start <= j && j <= start + len) - /* - * Coefficients are updated in strided pairs, so the bounds for the - * intermediate states alternate twice between the old and new bound - */ - invariant(array_abs_bound(r, 0, j, bound + MLKEM_Q)) - invariant(array_abs_bound(r, j, start + len, bound)) - invariant(array_abs_bound(r, start + len, j + len, bound + MLKEM_Q)) - invariant(array_abs_bound(r, j + len, MLKEM_N, bound))) - { - int16_t t; - t = fqmul(r[j + len], zeta); - r[j + len] = r[j] - t; - r[j] = r[j] + t; - } -} - -/* - *Compute one layer of forward NTT - * Parameters: - * - r: Pointer to base of polynomial - * - len: Stride of butterflies in this layer. - * - layer: Ghost variable indicating which layer is being applied. - * Must match `len` via `len == MLKEM_N >> layer`. 
- * Note: `len` could be dropped and computed in the function, but - * we are following the structure of the reference NTT from the - * official Kyber implementation here, merely adding `layer` as - * a ghost variable for the specifications. - */ -static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer) -__contract__( - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer)) - requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q))) -{ - unsigned start, k; - /* `layer` is a ghost variable only needed in the CBMC specification */ - ((void)layer); - /* Twiddle factors for layer n start at index 2^(layer-1) */ - k = MLKEM_N / (2 * len); - for (start = 0; start < MLKEM_N; start += 2 * len) - __loop__( - invariant(start < MLKEM_N + 2 * len) - invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N) - invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q)) - invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q))) - { - int16_t zeta = zetas[k++]; - ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q); - } -} - -/* - * Compute full forward NTT - * NOTE: This particular implementation satisfies a much tighter - * bound on the output coefficients (5*q) than the contractual one (8*q), - * but this is not needed in the calling code. Should we change the - * base multiplication strategy to require smaller NTT output bounds, - * the proof may need strengthening. 
- */ - -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *p) -{ - unsigned len, layer; - int16_t *r; - debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); - r = p->coeffs; - - for (len = 128, layer = 1; len >= 2; len >>= 1, layer++) - __loop__( - invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer)) - invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))) - { - ntt_layer(r, len, layer); - } - - /* Check the stronger bound */ - debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); -} -#else /* MLKEM_USE_NATIVE_NTT */ - -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *p) -{ - debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); - ntt_native(p); - debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); -} -#endif /* MLKEM_USE_NATIVE_NTT */ - -#if !defined(MLKEM_USE_NATIVE_INTT) - -/* Compute one layer of inverse NTT */ -static void invntt_layer(int16_t *r, unsigned len, unsigned layer) -__contract__( - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7) - requires(len == (1 << (8 - layer))) - requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) -{ - unsigned start, k; - /* `layer` is a ghost variable used only in the specification */ - ((void)layer); - k = MLKEM_N / len - 1; - for (start = 0; start < MLKEM_N; start += 2 * len) - __loop__( - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) - invariant(start <= MLKEM_N && k <= 127) - /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */ - invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len)) - { - unsigned j; - int16_t zeta = zetas[k--]; - for (j = start; j < start + len; j++) - __loop__( - invariant(start <= j && j <= start + len) - invariant(start <= MLKEM_N && k <= 127) - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) - { - int16_t t = r[j]; - r[j] = barrett_reduce(t + r[j + len]); - r[j + len] = r[j + len] - t; - r[j + len] = fqmul(r[j + len], 
zeta); - } - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *p) -{ - /* - * Scale input polynomial to account for Montgomery factor - * and NTT twist. This also brings coefficients down to - * absolute value < MLKEM_Q. - */ - unsigned j, len, layer; - const int16_t f = 1441; - int16_t *r = p->coeffs; - - for (j = 0; j < MLKEM_N; j++) - __loop__( - invariant(j <= MLKEM_N) - invariant(array_abs_bound(r, 0, j, MLKEM_Q))) - { - r[j] = fqmul(r[j], f); - } - - /* Run the invNTT layers */ - for (len = 2, layer = 7; len <= 128; len <<= 1, layer--) - __loop__( - invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer))) - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) - { - invntt_layer(p->coeffs, len, layer); - } - - debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); -} -#else /* MLKEM_USE_NATIVE_INTT */ - -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *p) -{ - intt_native(p); - debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); -} -#endif /* MLKEM_USE_NATIVE_INTT */ - -MLKEM_NATIVE_INTERNAL_API -void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], - int16_t b_cached) -{ - int32_t t0, t1; - debug_assert_bound(a, 2, 0, UINT12_LIMIT); - - t0 = (int32_t)a[1] * b_cached; - t0 += (int32_t)a[0] * b[0]; - t1 = (int32_t)a[0] * b[1]; - t1 += (int32_t)a[1] * b[0]; - - /* |ti| < 2 * q * 2^15 */ - r[0] = montgomery_reduce(t0); - r[1] = montgomery_reduce(t1); - - debug_assert_abs_bound(r, 2, 2 * MLKEM_Q); -} - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt) -int empty_cu_ntt; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/ntt.h deleted file mode 100644 index 4e80d3ab3..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/ntt.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * 
SPDX-License-Identifier: Apache-2.0 - */ -#ifndef NTT_H -#define NTT_H -#include "common.h" - -#include -#include "cbmc.h" -#include "poly.h" -#include "reduce.h" - -#define zetas MLKEM_NAMESPACE(zetas) -extern const int16_t zetas[128]; - -#define poly_ntt MLKEM_NAMESPACE(poly_ntt) -/************************************************* - * Name: poly_ntt - * - * Description: Computes negacyclic number-theoretic transform (NTT) of - * a polynomial in place. - * - * The input is assumed to be in normal order and - * coefficient-wise bound by MLKEM_Q in absolute value. - * - * The output polynomial is in bitreversed order, and - * coefficient-wise bound by NTT_BOUND in absolute value. - * - * (NOTE: Sometimes the input to the NTT is actually smaller, - * which gives better bounds.) - * - * Arguments: - poly *p: pointer to in/output polynomial - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *r) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q)) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND)) -); - -#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont) -/************************************************* - * Name: poly_invntt_tomont - * - * Description: Computes inverse of negacyclic number-theoretic transform (NTT) - * of a polynomial in place; - * inputs assumed to be in bitreversed order, output in normal - * order - * - * The input is assumed to be in bitreversed order, and can - * have arbitrary coefficients in int16_t. - * - * The output polynomial is in normal order, and - * coefficient-wise bound by INVNTT_BOUND in absolute value. 
- * - * Arguments: - uint16_t *a: pointer to in/output polynomial - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *r) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND)) -); - -#define basemul_cached MLKEM_NAMESPACE(basemul_cached) -/************************************************************ - * Name: basemul_cached - * - * Description: Computes a representative modulo q of - * (a0*b0 + a1*b_cached, a0*b1 + a1*b0)/65536 - * - * If b_cached is b1*zeta, this represents the - * product of (a0 + a1*X) and (b0 + b1*X) in - * Fq[X]/(X^2 - zeta). - * - * Arguments: - r: Pointer to output polynomial - * Upon return, coefficients are bound by - * 2*MLKEM_Q in absolute value. - * - a: Pointer to first input polynomial - * Every coefficient must be in [0..4095] - * - b: Pointer to second input polynomial - * Can have arbitrary int16_t coefficients - * - b_cached: Some precomputed value, typically derived from - * b1 and a twiddle factor. Can be an arbitary int16_t. 
- ************************************************************/ -MLKEM_NATIVE_INTERNAL_API -void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], - int16_t b_cached) -__contract__( - requires(memory_no_alias(r, 2 * sizeof(int16_t))) - requires(memory_no_alias(a, 2 * sizeof(int16_t))) - requires(memory_no_alias(b, 2 * sizeof(int16_t))) - requires(array_bound(a, 0, 2, 0, UINT12_LIMIT)) - assigns(memory_slice(r, 2 * sizeof(int16_t))) - ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q)) -); - -#endif /* NTT_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/params.h index 57ea4c8ba..7f6c12625 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/params.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/params.h @@ -18,6 +18,7 @@ #define MLKEM_N 256 #define MLKEM_Q 3329 #define UINT12_LIMIT 4096 +#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */ #define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define MLKEM_SSBYTES 32 /* size in bytes of shared key */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.c index 7483ebf6d..e8a2e2c6e 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.c @@ -8,388 +8,246 @@ #include #include #include "arith_backend.h" -#include "cbd.h" #include "cbmc.h" #include "debug.h" #include "fips202x4.h" -#include "ntt.h" #include "poly.h" -#include "reduce.h" +#include "sampling.h" #include "symmetric.h" #include "verify.h" -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3) -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a) -{ - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - uint8_t t[8] = {0}; - for 
(j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_bound(t, 0, j, 0, 16))) - { - t[j] = scalar_compress_d4(a->coeffs[8 * i + j]); - } - - r[i * 4] = t[0] | (t[1] << 4); - r[i * 4 + 1] = t[2] | (t[3] << 4); - r[i * 4 + 2] = t[4] | (t[5] << 4); - r[i * 4 + 3] = t[6] | (t[7] << 4); - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a) -{ - unsigned j; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - for (j = 0; j < MLKEM_N / 4; j++) - __loop__(invariant(j <= MLKEM_N / 4)) - { - unsigned k; - uint16_t t[4]; - for (k = 0; k < 4; k++) - __loop__( - invariant(k <= 4) - invariant(forall(r, 0, k, t[r] < (1u << 10)))) - { - t[k] = scalar_compress_d10(a->coeffs[4 * j + k]); - } - - /* - * Make all implicit truncation explicit. No data is being - * truncated for the LHS's since each t[i] is 10-bit in size. - */ - r[5 * j + 0] = (t[0] >> 0) & 0xFF; - r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF); - r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF); - r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF); - r[5 * j + 4] = (t[3] >> 2); - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 2; i++) - __loop__( - invariant(i <= MLKEM_N / 2) - invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q))) - { - r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF); - r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF); - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); -} - -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d10(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]) +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16) +#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic) +#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce) +#define fqmul MLKEM_NAMESPACE(fqmul) +#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce) +#define basemul_cached MLKEM_NAMESPACE(basemul_cached) +#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q) +#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block) +#define ntt_layer MLKEM_NAMESPACE(ntt_layer) +#define invntt_layer MLKEM_NAMESPACE(invntt_layer) +/* End of static namespacing */ + +/************************************************* + * Name: cast_uint16_to_int16 + * + * Description: Cast uint16 value to int16 + * + * Returns: + * input x in 0 .. 32767: returns value unchanged + * input x in 32768 .. 65535: returns (x - 65536) + **************************************************/ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "conversion" +#endif +ALWAYS_INLINE +static INLINE int16_t cast_uint16_to_int16(uint16_t x) { - unsigned j; - for (j = 0; j < MLKEM_N / 4; j++) - __loop__( - invariant(j <= MLKEM_N / 4) - invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q))) - { - unsigned k; - uint16_t t[4]; - uint8_t const *base = &a[5 * j]; - - t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); - t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6)); - t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4)); - t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2)); - - for (k = 0; k < 4; k++) - __loop__( - invariant(k <= 4) - invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q))) - { - r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]); - } - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + /* + * PORTABILITY: This relies on uint16_t -> int16_t + * being implemented as the inverse of int16_t -> uint16_t, + * which is implementation-defined (C99 6.3.1.3 (3)) + * 
CBMC (correctly) fails to prove this conversion is OK, + * so we have to suppress that check here + */ + return (int16_t)x; } -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ - || MLKEM_K == 3) */ +#ifdef CBMC +#pragma CPROVER check pop +#endif -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a) +/************************************************* + * Name: montgomery_reduce_generic + * + * Description: Generic Montgomery reduction; given a 32-bit integer a, computes + * 16-bit integer congruent to a * R^-1 mod q, where R=2^16 + * + * Arguments: - int32_t a: input integer to be reduced + * + * Returns: integer congruent to a * R^-1 modulo q, with absolute value + * <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2 + * + **************************************************/ +ALWAYS_INLINE +static INLINE int16_t montgomery_reduce_generic(int32_t a) { - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */ + const uint32_t QINV = 62209; /* q^-1 mod 2^16 */ - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - uint8_t t[8] = {0}; - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_bound(t, 0, j, 0, 32))) - { - t[j] = scalar_compress_d5(a->coeffs[8 * i + j]); - } + /* Compute a*q^{-1} mod 2^16 in unsigned representatives */ + const uint16_t a_reduced = a & UINT16_MAX; + const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX; - /* - * Explicitly truncate to avoid warning about - * implicit truncation in CBMC, and use array indexing into - * r rather than pointer-arithmetic to simplify verification - */ - r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5)); - r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); - r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4)); 
- r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); - r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3)); - } -} + /* Lift to signed canonical representative mod 2^16. */ + const int16_t t = cast_uint16_to_int16(a_inverted); -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a) -{ - unsigned j; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + int32_t r = a - ((int32_t)t * MLKEM_Q); + /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */ - for (j = 0; j < MLKEM_N / 8; j++) - __loop__(invariant(j <= MLKEM_N / 8)) - { - unsigned k; - uint16_t t[8]; - for (k = 0; k < 8; k++) - __loop__( - invariant(k <= 8) - invariant(forall(r, 0, k, t[r] < (1u << 11)))) - { - t[k] = scalar_compress_d11(a->coeffs[8 * j + k]); - } + /* + * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, + * implementation-defined for negative left argument. Here, + * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) + */ + r = r >> 16; + /* Bounds: |r >> 16| <= ceil(|r| / 2^16) + * <= ceil(|a| / 2^16 + MLKEM_Q / 2) + * <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 + * + * (Note that |a >> n| = ceil(|a| / 2^16) for negative a) + */ - /* - * Make all implicit truncation explicit. No data is being - * truncated for the LHS's since each t[i] is 11-bit in size. 
- */ - r[11 * j + 0] = (t[0] >> 0) & 0xFF; - r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF); - r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF); - r[11 * j + 3] = (t[2] >> 2) & 0xFF; - r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF); - r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF); - r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF); - r[11 * j + 7] = (t[5] >> 1) & 0xFF; - r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF); - r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF); - r[11 * j + 10] = (t[7] >> 3); - } + return (int16_t)r; } -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]) +/************************************************* + * Name: montgomery_reduce + * + * Description: Montgomery reduction + * + * Arguments: - int32_t a: input integer to be reduced + * Must be smaller than 2 * 2^12 * 2^15 in absolute value. + * + * Returns: integer congruent to a * R^-1 modulo q, + * smaller than 2 * q in absolute value. + **************************************************/ +static INLINE int16_t montgomery_reduce(int32_t a) +__contract__( + requires(a > -(2 * UINT12_LIMIT * 32768)) + requires(a < (2 * UINT12_LIMIT * 32768)) + ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q) +) { - unsigned i; - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) - { - unsigned j; - uint8_t t[8]; - const unsigned offset = i * 5; - /* - * Explicitly truncate to avoid warning about - * implicit truncation in CBMC and unwind loop for ease - * of proof. 
- */ - - /* - * Decompress 5 8-bit bytes (so 40 bits) into - * 8 5-bit values stored in t[] - */ - t[0] = 0x1F & (a[offset + 0] >> 0); - t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3)); - t[2] = 0x1F & (a[offset + 1] >> 2); - t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1)); - t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4)); - t[5] = 0x1F & (a[offset + 3] >> 1); - t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2)); - t[7] = 0x1F & (a[offset + 4] >> 3); - - /* and copy to the correct slice in r[] */ - for (j = 0; j < 8; j++) - __loop__( - invariant(j <= 8 && i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) - { - r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]); - } - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + int16_t res; + debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768); + + res = montgomery_reduce_generic(a); + /* Bounds: + * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2 + * <= UINT12_LIMIT + (MLKEM_Q + 1) / 2 + * < 2 * MLKEM_Q */ + + debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q); + return res; } -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d11(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]) +#if !defined(MLKEM_USE_NATIVE_POLY_TOMONT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: fqmul + * + * Description: Montgomery multiplication modulo q=3329 + * + * Arguments: - int16_t a: first factor + * Can be any int16_t. + * - int16_t b: second factor. + * Must be signed canonical (abs value <(q+1)/2) + * + * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and + * smaller than q in absolute value. 
+ * + **************************************************/ +static INLINE int16_t fqmul(int16_t a, int16_t b) +__contract__( + requires(b > -HALF_Q) + requires(b < HALF_Q) + ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q) +) { - unsigned j; - for (j = 0; j < MLKEM_N / 8; j++) - __loop__( - invariant(j <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q))) - { - unsigned k; - uint16_t t[8]; - uint8_t const *base = &a[11 * j]; - t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); - t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5)); - t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) | - ((uint16_t)base[4] << 10)); - t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7)); - t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4)); - t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) | - ((uint16_t)base[8] << 9)); - t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6)); - t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3)); - - for (k = 0; k < 8; k++) - __loop__( - invariant(k <= 8) - invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q))) - { - r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]); - } - } + int16_t res; + debug_assert_abs_bound(&b, 1, HALF_Q); + + res = montgomery_reduce((int32_t)a * (int32_t)b); + /* Bounds: + * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2 + * < MLKEM_Q + */ - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + debug_assert_abs_bound(&res, 1, MLKEM_Q); + return res; } -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */ - -#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +#endif /* !defined(MLKEM_USE_NATIVE_POLY_TOMONT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_NTT) || \ + 
!defined(MLKEM_USE_NATIVE_INTT) */ + +#if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || !defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: barrett_reduce + * + * Description: Barrett reduction; given a 16-bit integer a, computes + * centered representative congruent to a mod q in + * {-(q-1)/2,...,(q-1)/2} + * + * Arguments: - int16_t a: input integer to be reduced + * + * Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. + **************************************************/ +static INLINE int16_t barrett_reduce(int16_t a) +__contract__( + ensures(return_value > -HALF_Q && return_value < HALF_Q) +) { - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_N / 2; i++) - __loop__(invariant(i <= MLKEM_N / 2)) - { - const uint16_t t0 = a->coeffs[2 * i]; - const uint16_t t1 = a->coeffs[2 * i + 1]; - /* - * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of - * significant data, so these can be packed into 24 bits or exactly - * 3 bytes, as follows. - */ - - /* Least significant bits 0 - 7 of t0. */ - r[3 * i + 0] = t0 & 0xFF; - - /* - * Most significant bits 8 - 11 of t0 become the least significant - * nibble of the second byte. The least significant 4 bits - * of t1 become the upper nibble of the second byte. - */ - r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0); + /* + * To divide by MLKEM_Q using Barrett multiplication, the "magic number" + * multiplier is round_to_nearest(2**26/MLKEM_Q) + */ + const int BPOWER = 26; + const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q; - /* Bits 4 - 11 of t1 become the third byte. 
*/ - r[3 * i + 2] = t1 >> 4; - } -} -#else /* MLKEM_USE_NATIVE_POLY_TOBYTES */ -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) -{ - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - poly_tobytes_native(r, a); -} -#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + /* + * Compute round_to_nearest(a/MLKEM_Q) using the multiplier + * above and shift by BPOWER places. + * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, + * implementation-defined for negative left argument. Here, + * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) + */ + const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER; -#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 2; i++) - __loop__( - invariant(i <= MLKEM_N / 2) - invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT))) - { - const uint8_t t0 = a[3 * i + 0]; - const uint8_t t1 = a[3 * i + 1]; - const uint8_t t2 = a[3 * i + 2]; - r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF); - r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4); - } + /* + * t is in -10 .. 
+10, so we need 32-bit math to + * evaluate t * MLKEM_Q and the subsequent subtraction + */ + int16_t res = (int16_t)(a - t * MLKEM_Q); - /* Note that the coefficients are not canonical */ - debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT); -} -#else /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -{ - poly_frombytes_native(r, a); + debug_assert_abs_bound(&res, 1, HALF_Q); + return res; } -#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ - -MLKEM_NATIVE_INTERNAL_API -void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +#endif /* !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || \ + !defined(MLKEM_USE_NATIVE_INTT) */ + +static void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], + int16_t b_cached) +__contract__( + requires(memory_no_alias(r, 2 * sizeof(int16_t))) + requires(memory_no_alias(a, 2 * sizeof(int16_t))) + requires(memory_no_alias(b, 2 * sizeof(int16_t))) + requires(array_bound(a, 0, 2, 0, UINT12_LIMIT)) + assigns(memory_slice(r, 2 * sizeof(int16_t))) + ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))) { - unsigned i; -#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8) -#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!" 
-#endif + int32_t t0, t1; + debug_assert_bound(a, 2, 0, UINT12_LIMIT); - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) - { - unsigned j; - for (j = 0; j < 8; j++) - __loop__( - invariant(i < MLKEM_N / 8 && j <= 8) - invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) - { - /* Prevent the compiler from recognizing this as a bit selection */ - uint8_t mask = value_barrier_u8(1u << j); - r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask); - } - } - debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); -} + t0 = (int32_t)a[1] * b_cached; + t0 += (int32_t)a[0] * b[0]; + t1 = (int32_t)a[0] * b[1]; + t1 += (int32_t)a[1] * b[0]; -MLKEM_NATIVE_INTERNAL_API -void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a) -{ - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + /* |ti| < 2 * q * 2^15 */ + r[0] = montgomery_reduce(t0); + r[1] = montgomery_reduce(t1); - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - msg[i] = 0; - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8)) - { - uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]); - msg[i] |= t << j; - } - } + debug_assert_abs_bound(r, 2, 2 * MLKEM_Q); } MLKEM_NATIVE_INTERNAL_API @@ -434,12 +292,46 @@ void poly_tomont(poly *r) MLKEM_NATIVE_INTERNAL_API void poly_tomont(poly *r) { - poly_tomont_native(r); + poly_tomont_native(r->coeffs); debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); } #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ #if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) +/************************************************************ + * Name: scalar_signed_to_unsigned_q + * + * Description: converts signed polynomial coefficient + * from signed (-3328 .. 3328) form to + * unsigned form (0 .. 3328). 
+ * + * Note: Cryptographic constant time implementation + * + * Examples: 0 -> 0 + * 1 -> 1 + * 3328 -> 3328 + * -1 -> 3328 + * -2 -> 3327 + * -3328 -> 1 + * + * Arguments: c: signed coefficient to be converted + ************************************************************/ +static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c) +__contract__( + requires(c > -MLKEM_Q && c < MLKEM_Q) + ensures(return_value >= 0 && return_value < MLKEM_Q) + ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q))) +{ + debug_assert_abs_bound(&c, 1, MLKEM_Q); + + /* Add Q if c is negative, but in constant time */ + c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c)); + + /* and therefore cast to uint16_t is safe. */ + debug_assert_bound(&c, 1, 0, MLKEM_Q); + return (uint16_t)c; +} + MLKEM_NATIVE_INTERNAL_API void poly_reduce(poly *r) { @@ -461,7 +353,7 @@ void poly_reduce(poly *r) MLKEM_NATIVE_INTERNAL_API void poly_reduce(poly *r) { - poly_reduce_native(r); + poly_reduce_native(r->coeffs); debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); } #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ @@ -520,13 +412,232 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a) MLKEM_NATIVE_INTERNAL_API void poly_mulcache_compute(poly_mulcache *x, const poly *a) { - poly_mulcache_compute_native(x, a); + poly_mulcache_compute_native(x->coeffs, a->coeffs); /* Omitting bounds assertion since native implementations may * decide not to use a mulcache. Note that the C backend implementation * of poly_basemul_montgomery_cached() does still include the check. */ } #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ +#if !defined(MLKEM_USE_NATIVE_NTT) +/* + * Computes a block of CT butterflies with a fixed twiddle factor, + * using Montgomery multiplication. + * Parameters: + * - r: Pointer to base of polynomial (_not_ the base of butterfly block) + * - zeta: Twiddle factor to use for the butterfly. This must be in + * Montgomery form and signed canonical. 
+ * - start: Offset to the beginning of the butterfly block + * - len: Index difference between coefficients subject to a butterfly + * - bound: Ghost variable describing coefficient bound: Prior to `start`, + * coefficients must be bound by `bound + MLKEM_Q`. Post `start`, + * they must be bound by `bound`. + * When this function returns, output coefficients in the index range + * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`. + * Example: + * - start=8, len=4 + * This would compute the following four butterflies + * 8 -- 12 + * 9 -- 13 + * 10 -- 14 + * 11 -- 15 + * - start=4, len=2 + * This would compute the following two butterflies + * 4 -- 6 + * 5 -- 7 + */ +static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, + unsigned start, unsigned len, int bound) +__contract__( + requires(start < MLKEM_N) + requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N) + requires(0 <= bound && bound < INT16_MAX - MLKEM_Q) + requires(-HALF_Q < zeta && zeta < HALF_Q) + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(array_abs_bound(r, 0, start, bound + MLKEM_Q)) + requires(array_abs_bound(r, start, MLKEM_N, bound)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q)) + ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound))) +{ + /* `bound` is a ghost variable only needed in the CBMC specification */ + unsigned j; + ((void)bound); + for (j = start; j < start + len; j++) + __loop__( + invariant(start <= j && j <= start + len) + /* + * Coefficients are updated in strided pairs, so the bounds for the + * intermediate states alternate twice between the old and new bound + */ + invariant(array_abs_bound(r, 0, j, bound + MLKEM_Q)) + invariant(array_abs_bound(r, j, start + len, bound)) + invariant(array_abs_bound(r, start + len, j + len, bound + MLKEM_Q)) + invariant(array_abs_bound(r, j + len, MLKEM_N, bound))) + { + int16_t t; + t = fqmul(r[j + len], 
zeta); + r[j + len] = r[j] - t; + r[j] = r[j] + t; + } +} + +/* + * Compute one layer of forward NTT + * Parameters: + * - r: Pointer to base of polynomial + * - len: Stride of butterflies in this layer. + * - layer: Ghost variable indicating which layer is being applied. + * Must match `len` via `len == MLKEM_N >> layer`. + * Note: `len` could be dropped and computed in the function, but + * we are following the structure of the reference NTT from the + * official Kyber implementation here, merely adding `layer` as + * a ghost variable for the specifications. + */ +static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer) +__contract__( + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer)) + requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q))) +{ + unsigned start, k; + /* `layer` is a ghost variable only needed in the CBMC specification */ + ((void)layer); + /* Twiddle factors for layer n start at index 2^(layer-1) */ + k = MLKEM_N / (2 * len); + for (start = 0; start < MLKEM_N; start += 2 * len) + __loop__( + invariant(start < MLKEM_N + 2 * len) + invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N) + invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q)) + invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q))) + { + int16_t zeta = zetas[k++]; + ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q); + } +} + +/* + * Compute full forward NTT + * NOTE: This particular implementation satisfies a much tighter + * bound on the output coefficients (5*q) than the contractual one (8*q), + * but this is not needed in the calling code. Should we change the + * base multiplication strategy to require smaller NTT output bounds, + * the proof may need strengthening. 
+ */ + +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *p) +{ + unsigned len, layer; + int16_t *r; + debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); + r = p->coeffs; + + for (len = 128, layer = 1; len >= 2; len >>= 1, layer++) + __loop__( + invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer)) + invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))) + { + ntt_layer(r, len, layer); + } + + /* Check the stronger bound */ + debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); +} +#else /* MLKEM_USE_NATIVE_NTT */ + +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *p) +{ + debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); + ntt_native(p->coeffs); + debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); +} +#endif /* MLKEM_USE_NATIVE_NTT */ + +#if !defined(MLKEM_USE_NATIVE_INTT) + +/* Compute one layer of inverse NTT */ +static void invntt_layer(int16_t *r, unsigned len, unsigned layer) +__contract__( + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7) + requires(len == (1 << (8 - layer))) + requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) +{ + unsigned start, k; + /* `layer` is a ghost variable used only in the specification */ + ((void)layer); + k = MLKEM_N / len - 1; + for (start = 0; start < MLKEM_N; start += 2 * len) + __loop__( + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) + invariant(start <= MLKEM_N && k <= 127) + /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */ + invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len)) + { + unsigned j; + int16_t zeta = zetas[k--]; + for (j = start; j < start + len; j++) + __loop__( + invariant(start <= j && j <= start + len) + invariant(start <= MLKEM_N && k <= 127) + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) + { + int16_t t = r[j]; + r[j] = barrett_reduce(t + r[j + len]); + r[j + len] = r[j + len] - t; + r[j + len] = fqmul(r[j 
+ len], zeta); + } + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *p) +{ + /* + * Scale input polynomial to account for Montgomery factor + * and NTT twist. This also brings coefficients down to + * absolute value < MLKEM_Q. + */ + unsigned j, len, layer; + const int16_t f = 1441; + int16_t *r = p->coeffs; + + for (j = 0; j < MLKEM_N; j++) + __loop__( + invariant(j <= MLKEM_N) + invariant(array_abs_bound(r, 0, j, MLKEM_Q))) + { + r[j] = fqmul(r[j], f); + } + + /* Run the invNTT layers */ + for (len = 2, layer = 7; len <= 128; len <<= 1, layer--) + __loop__( + invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer))) + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) + { + invntt_layer(p->coeffs, len, layer); + } + + debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); +} +#else /* MLKEM_USE_NATIVE_INTT */ + +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *p) +{ + intt_native(p->coeffs); + debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); +} +#endif /* MLKEM_USE_NATIVE_INTT */ + #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ #define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.h index 6a14c785d..cb0d67c1a 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.h @@ -9,7 +9,7 @@ #include #include "cbmc.h" #include "common.h" -#include "reduce.h" +#include "debug.h" #include "verify.h" /* Absolute exclusive upper bound for the output of the inverse NTT */ @@ -18,6 +18,9 @@ /* Absolute exclusive upper bound for the output of the forward NTT */ #define NTT_BOUND (8 * MLKEM_Q) +#define zetas MLKEM_NAMESPACE(zetas) +extern const int16_t zetas[128]; + /* * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] @@ -38,520 +41,6 @@ typedef struct int16_t coeffs[MLKEM_N >> 1]; } poly_mulcache; -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1) -#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4) -#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5) -#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10) -#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11) -#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4) -#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5) -#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10) -#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11) -#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q) -/* End of static namespacing */ - -/************************************************************ - * Name: scalar_compress_d1 - * - * Description: Computes round(u * 2 / q) - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 1. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. 
- */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d1(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 2) - ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) ) -{ - uint32_t d0 = u << 1; - d0 *= 645083; - d0 += 1u << 30; - d0 >>= 31; - return d0; -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_compress_d4 - * - * Description: Computes round(u * 16 / q) % 16 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 4. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d4(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 16) - ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16)) -{ - uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */ - return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */ -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d4 - * - * Description: Computes round(u * q / 16) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 4. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d4(uint32_t u) -__contract__( - requires(0 <= u && u < 16) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 8) / 16; } - -/************************************************************ - * Name: scalar_compress_d5 - * - * Description: Computes round(u * 32 / q) % 32 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 5. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d5(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 32) - ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) ) -{ - uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */ - return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */ -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d5 - * - * Description: Computes round(u * q / 32) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 5. - * - * Arguments: - u: Unsigned canonical modulus modulo 32 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d5(uint32_t u) -__contract__( - requires(0 <= u && u < 32) - ensures(return_value <= MLKEM_Q - 1) -) { return ((u * MLKEM_Q) + 16) / 32; } - -/************************************************************ - * Name: scalar_compress_d10 - * - * Description: Computes round(u * 2**10 / q) % 2**10 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d10(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < (1u << 10)) - ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10))) -{ - uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^32 / MLKEM_Q) */ - d0 = (d0 + ((uint64_t)1u << 32)) >> 33; - return (d0 & 0x3FF); -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d10 - * - * Description: Computes round(u * q / 1024) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d10(uint32_t u) -__contract__( - requires(0 <= u && u < 1024) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 512) / 1024; } - -/************************************************************ - * Name: scalar_compress_d11 - * - * Description: Computes round(u * 2**11 / q) % 2**11 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 11. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d11(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < (1u << 11)) - ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11))) -{ - uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */ - d0 = (d0 + ((uint64_t)1u << 32)) >> 33; - return (d0 & 0x7FF); -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d11 - * - * Description: Computes round(u * q / 1024) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d11(uint32_t u) -__contract__( - requires(0 <= u && u < 2048) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 1024) / 2048; } - -/************************************************************ - * Name: scalar_signed_to_unsigned_q - * - * Description: converts signed polynomial coefficient - * from signed (-3328 .. 3328) form to - * unsigned form (0 .. 3328). - * - * Note: Cryptographic constant time implementation - * - * Examples: 0 -> 0 - * 1 -> 1 - * 3328 -> 3328 - * -1 -> 3328 - * -2 -> 3327 - * -3328 -> 1 - * - * Arguments: c: signed coefficient to be converted - ************************************************************/ -static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c) -__contract__( - requires(c > -MLKEM_Q && c < MLKEM_Q) - ensures(return_value >= 0 && return_value < MLKEM_Q) - ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q))) -{ - debug_assert_abs_bound(&c, 1, MLKEM_Q); - - /* Add Q if c is negative, but in constant time */ - c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c)); - - /* and therefore cast to uint16_t is safe. */ - debug_assert_bound(&c, 1, 0, MLKEM_Q); - return (uint16_t)c; -} - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \ - (MLKEM_K == 2 || MLKEM_K == 3) -#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4) -/************************************************* - * Name: poly_compress_d4 - * - * Description: Compression (4 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a); - -#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10) -/************************************************* - * Name: poly_compress_d10 - * - * Description: Compression (10 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a); - -#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4) -/************************************************* - * Name: poly_decompress_d4 - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]); - -#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10) -/************************************************* - * Name: poly_decompress_d10 - * - * Description: De-serialization and subsequent decompression (10 bits) of a - * polynomial; approximate inverse of poly_compress_d10 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d10(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]); -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ - || MLKEM_K == 3) */ - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 -#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5) -/************************************************* - * Name: poly_compress_d5 - * - * Description: Compression (5 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a); - -#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11) -/************************************************* - * Name: poly_compress_d11 - * - * Description: Compression (11 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a); - -#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5) -/************************************************* - * Name: poly_decompress_d5 - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]); - -#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11) -/************************************************* - * Name: poly_decompress_d11 - * - * Description: De-serialization and subsequent decompression (11 bits) of a - * polynomial; approximate inverse of poly_compress_d11 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d11(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]); -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \ - */ - -#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes) -/************************************************* - * Name: poly_tobytes - * - * Description: Serialization of a polynomial. - * Signed coefficients are converted to - * unsigned form before serialization. 
- * - * Arguments: INPUT: - * - a: const pointer to input polynomial, - * with each coefficient in the range [0,1,..,Q-1] - * OUTPUT - * - r: pointer to output byte array - * (of MLKEM_POLYBYTES bytes) - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYBYTES)) - requires(memory_no_alias(a, sizeof(poly))) - requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(r)) -); - - -#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes) -/************************************************* - * Name: poly_frombytes - * - * Description: De-serialization of a polynomial. - * - * Arguments: INPUT - * - a: pointer to input byte array - * (of MLKEM_POLYBYTES bytes) - * OUTPUT - * - r: pointer to output polynomial, with - * each coefficient unsigned and in the range - * 0 .. 4095 - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)) -); - - -#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg) -/************************************************* - * Name: poly_frommsg - * - * Description: Convert 32-byte message to polynomial - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *msg: pointer to input message - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) -__contract__( - requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(object_whole(r)) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) -); - -#define poly_tomsg 
MLKEM_NAMESPACE(poly_tomsg) -/************************************************* - * Name: poly_tomsg - * - * Description: Convert polynomial to 32-byte message - * - * Arguments: - uint8_t *msg: pointer to output message - * - const poly *r: pointer to input polynomial - * Coefficients must be unsigned canonical - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r) -__contract__( - requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(msg)) -); - #define poly_basemul_montgomery_cached \ MLKEM_NAMESPACE(poly_basemul_montgomery_cached) /************************************************* @@ -715,4 +204,56 @@ __contract__( assigns(object_whole(r)) ); +#define poly_ntt MLKEM_NAMESPACE(poly_ntt) +/************************************************* + * Name: poly_ntt + * + * Description: Computes negacyclic number-theoretic transform (NTT) of + * a polynomial in place. + * + * The input is assumed to be in normal order and + * coefficient-wise bound by MLKEM_Q in absolute value. + * + * The output polynomial is in bitreversed order, and + * coefficient-wise bound by NTT_BOUND in absolute value. + * + * (NOTE: Sometimes the input to the NTT is actually smaller, + * which gives better bounds.) 
+ * + * Arguments: - poly *r: pointer to in/output polynomial + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *r) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q)) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND)) +); + +#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont) +/************************************************* + * Name: poly_invntt_tomont + * + * Description: Computes inverse of negacyclic number-theoretic transform (NTT) + * of a polynomial in place; + * inputs assumed to be in bitreversed order, output in normal + * order + * + * The input is assumed to be in bitreversed order, and can + * have arbitrary coefficients in int16_t. + * + * The output polynomial is in normal order, and + * coefficient-wise bound by INVNTT_BOUND in absolute value. + * + * Arguments: - poly *r: pointer to in/output polynomial + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *r) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND)) +); + #endif /* POLY_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly_k.c similarity index 97% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/polyvec.c rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly_k.c index 50ea1c34a..c2d330ea9 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/polyvec.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly_k.c @@ -2,13 +2,12 @@ * Copyright (c) 2024 The mlkem-native project authors * SPDX-License-Identifier: Apache-2.0 */ -#include "polyvec.h" +#include "poly_k.h" #include #include #include "arith_backend.h" -#include "cbd.h" -#include
"ntt.h" -#include "poly.h" +#include "compress.h" +#include "sampling.h" #include "symmetric.h" #include "debug.h" @@ -131,7 +130,9 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, /* Omitting bounds assertion for cache since native implementations may * decide not to use a mulcache. Note that the C backend implementation * of poly_basemul_montgomery_cached() does still include the check. */ - polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache); + polyvec_basemul_acc_montgomery_cached_native(r->coeffs, (const int16_t *)a, + (const int16_t *)b, + (const int16_t *)b_cache); } #endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly_k.h similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/polyvec.h rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly_k.h index 8be8579e0..0aea95912 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/polyvec.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly_k.h @@ -2,11 +2,12 @@ * Copyright (c) 2024 The mlkem-native project authors * SPDX-License-Identifier: Apache-2.0 */ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef POLY_K_H +#define POLY_K_H #include #include "common.h" +#include "compress.h" #include "poly.h" #define polyvec MLKEM_NAMESPACE_K(polyvec) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/reduce.h deleted file mode 100644 index b432a4201..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/reduce.h +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef REDUCE_H -#define REDUCE_H - -#include -#include "cbmc.h" -#include "common.h" -#include "debug.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of 
mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16) -#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic) -#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce) -#define fqmul MLKEM_NAMESPACE(fqmul) -#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce) -/* End of static namespacing */ - -#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */ - -/************************************************* - * Name: cast_uint16_to_int16 - * - * Description: Cast uint16 value to int16 - * - * Returns: - * input x in 0 .. 32767: returns value unchanged - * input x in 32768 .. 65535: returns (x - 65536) - **************************************************/ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "conversion" -#endif -ALWAYS_INLINE -static INLINE int16_t cast_uint16_to_int16(uint16_t x) -{ - /* - * PORTABILITY: This relies on uint16_t -> int16_t - * being implemented as the inverse of int16_t -> uint16_t, - * which is implementation-defined (C99 6.3.1.3 (3)) - * CBMC (correctly) fails to prove this conversion is OK, - * so we have to suppress that check here - */ - return (int16_t)x; -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************* - * Name: montgomery_reduce_generic - * - * Description: Generic Montgomery reduction; given a 32-bit integer a, computes - * 16-bit integer congruent to a * R^-1 mod q, where R=2^16 - * - * Arguments: - int32_t a: input integer to be reduced - * - * Returns: integer congruent to a * R^-1 modulo q, with absolute value - * <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2 - * - **************************************************/ -ALWAYS_INLINE -static INLINE int16_t montgomery_reduce_generic(int32_t a) -{ - /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */ - const uint32_t QINV = 62209; /* q^-1 mod 2^16 */ - - /* Compute a*q^{-1} mod 
2^16 in unsigned representatives */ - const uint16_t a_reduced = a & UINT16_MAX; - const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX; - - /* Lift to signed canonical representative mod 2^16. */ - const int16_t t = cast_uint16_to_int16(a_inverted); - - int32_t r = a - ((int32_t)t * MLKEM_Q); - /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */ - - /* - * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, - * implementation-defined for negative left argument. Here, - * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) - */ - r = r >> 16; - /* Bounds: |r >> 16| <= ceil(|r| / 2^16) - * <= ceil(|a| / 2^16 + MLKEM_Q / 2) - * <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 - * - * (Note that |a >> n| = ceil(|a| / 2^16) for negative a) - */ - - return (int16_t)r; -} - -/************************************************* - * Name: montgomery_reduce - * - * Description: Montgomery reduction - * - * Arguments: - int32_t a: input integer to be reduced - * Must be smaller than 2 * 2^12 * 2^15 in absolute value. - * - * Returns: integer congruent to a * R^-1 modulo q, - * smaller than 2 * q in absolute value. - **************************************************/ -static INLINE int16_t montgomery_reduce(int32_t a) -__contract__( - requires(a > -(2 * UINT12_LIMIT * 32768)) - requires(a < (2 * UINT12_LIMIT * 32768)) - ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q) -) -{ - int16_t res; - debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768); - - res = montgomery_reduce_generic(a); - /* Bounds: - * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2 - * <= UINT12_LIMIT + (MLKEM_Q + 1) / 2 - * < 2 * MLKEM_Q */ - - debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q); - return res; -} - -/************************************************* - * Name: fqmul - * - * Description: Montgomery multiplication modulo q=3329 - * - * Arguments: - int16_t a: first factor - * Can be any int16_t. 
- * - int16_t b: second factor. - * Must be signed canonical (abs value <(q+1)/2) - * - * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and - * smaller than q in absolute value. - * - **************************************************/ -static INLINE int16_t fqmul(int16_t a, int16_t b) -__contract__( - requires(b > -HALF_Q) - requires(b < HALF_Q) - ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q) -) -{ - int16_t res; - debug_assert_abs_bound(&b, 1, HALF_Q); - - res = montgomery_reduce((int32_t)a * (int32_t)b); - /* Bounds: - * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2 - * < MLKEM_Q - */ - - debug_assert_abs_bound(&res, 1, MLKEM_Q); - return res; -} - -/************************************************* - * Name: barrett_reduce - * - * Description: Barrett reduction; given a 16-bit integer a, computes - * centered representative congruent to a mod q in - * {-(q-1)/2,...,(q-1)/2} - * - * Arguments: - int16_t a: input integer to be reduced - * - * Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. - **************************************************/ -static INLINE int16_t barrett_reduce(int16_t a) -__contract__( - ensures(return_value > -HALF_Q && return_value < HALF_Q) -) -{ - /* - * To divide by MLKEM_Q using Barrett multiplication, the "magic number" - * multiplier is round_to_nearest(2**26/MLKEM_Q) - */ - const int BPOWER = 26; - const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q; - - /* - * Compute round_to_nearest(a/MLKEM_Q) using the multiplier - * above and shift by BPOWER places. - * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, - * implementation-defined for negative left argument. Here, - * we assume it's sign-preserving "arithmetic" shift right. 
(C99 6.5.7 (5)) - */ - const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER; - - /* - * t is in -10 .. +10, so we need 32-bit math to - * evaluate t * MLKEM_Q and the subsequent subtraction - */ - int16_t res = (int16_t)(a - t * MLKEM_Q); - - debug_assert_abs_bound(&res, 1, HALF_Q); - return res; -} - -#endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/sampling.c similarity index 73% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/rej_uniform.c rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/sampling.c index cbbe4407f..98cbdcb74 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/rej_uniform.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/sampling.c @@ -9,7 +9,7 @@ #include "debug.h" #include "fips202.h" #include "fips202x4.h" -#include "rej_uniform.h" +#include "sampling.h" #include "symmetric.h" /* Static namespacing @@ -18,6 +18,8 @@ * within a single compilation unit. */ #define rej_uniform MLKEM_NAMESPACE(rej_uniform) #define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar) +#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) +#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) /* End of static namespacing */ static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target, @@ -233,9 +235,113 @@ void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) xof_release(&state); } +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) +#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) +/* End of static namespacing */ + +/************************************************* + * Name: load32_littleendian + * + * Description: load 4 bytes into a 32-bit integer + * in little-endian order + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns 32-bit unsigned integer loaded from x + **************************************************/ +static uint32_t load32_littleendian(const uint8_t x[4]) +{ + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + r |= (uint32_t)x[3] << 24; + return r; +} + +MLKEM_NATIVE_INTERNAL_API +void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3))) + { + unsigned j; + uint32_t t = load32_littleendian(buf + 4 * i); + uint32_t d = t & 0x55555555; + d += (t >> 1) & 0x55555555; + + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3))) + { + const int16_t a = (d >> (4 * j + 0)) & 0x3; + const int16_t b = (d >> (4 * j + 2)) & 0x3; + r->coeffs[8 * i + j] = a - b; + } + } +} + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 +/************************************************* + * Name: load24_littleendian + * + * Description: load 3 bytes into a 32-bit integer + * in little-endian order. 
+ * This function is only needed for ML-KEM-512 + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) + **************************************************/ +static uint32_t load24_littleendian(const uint8_t x[3]) +{ + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + return r; +} + +MLKEM_NATIVE_INTERNAL_API +void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 4; i++) + __loop__( + invariant(i <= MLKEM_N / 4) + invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4))) + { + unsigned j; + const uint32_t t = load24_littleendian(buf + 3 * i); + uint32_t d = t & 0x00249249; + d += (t >> 1) & 0x00249249; + d += (t >> 2) & 0x00249249; + + for (j = 0; j < 4; j++) + __loop__( + invariant(i <= MLKEM_N / 4 && j <= 4) + invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4))) + { + const int16_t a = (d >> (6 * j + 0)) & 0x7; + const int16_t b = (d >> (6 * j + 3)) & 0x7; + r->coeffs[4 * i + j] = a - b; + } + } +} +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \ + 3 */ + #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ -#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform) -int empty_cu_rej_uniform; +#define empty_cu_sampling MLKEM_NAMESPACE_K(empty_cu_sampling) +int empty_cu_sampling; #endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/sampling.h similarity index 63% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/rej_uniform.h rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/sampling.h index 801287259..cc524e0fc 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/rej_uniform.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/sampling.h @@ -2,8 +2,8 @@ * Copyright (c) 2024 The mlkem-native 
project authors * SPDX-License-Identifier: Apache-2.0 */ -#ifndef REJ_UNIFORM_H -#define REJ_UNIFORM_H +#ifndef SAMPLING_H +#define SAMPLING_H #include #include @@ -11,6 +11,37 @@ #include "common.h" #include "poly.h" +#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2) +/************************************************* + * Name: poly_cbd2 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter eta=2 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]); + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 +#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3) +/************************************************* + * Name: poly_cbd3 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter eta=3. 
+ * This function is only needed for ML-KEM-512 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]); +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 */ + #define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4) /************************************************* * Name: poly_rej_uniform_x4 @@ -60,4 +91,4 @@ __contract__( assigns(memory_slice(entry, sizeof(poly))) ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q))); -#endif /* REJ_UNIFORM_H */ +#endif /* SAMPLING_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/zetas.c index 4ef887c62..987f0dce4 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/zetas.c @@ -10,7 +10,7 @@ #include "common.h" #if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) -#include "ntt.h" +#include "poly.h" /* * Table of zeta values used in the reference NTT and inverse NTT. diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/arith_backend.h index 0543b1bd1..ade31cda1 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/arith_backend.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/arith_backend.h @@ -17,7 +17,7 @@ * Keep this _after_ the inclusion of the backend; otherwise, * the sanity checks won't have an effect.
*/ #if defined(MLKEM_NATIVE_CHECK_APIS) -#include "api.h" +#include "native/api.h" #endif #endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbd.c deleted file mode 100644 index 1e6b7c5d1..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbd.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED - -#include -#include "cbd.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) -#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) -/* End of static namespacing */ - -/************************************************* - * Name: load32_littleendian - * - * Description: load 4 bytes into a 32-bit integer - * in little-endian order - * - * Arguments: - const uint8_t *x: pointer to input byte array - * - * Returns 32-bit unsigned integer loaded from x - **************************************************/ -static uint32_t load32_littleendian(const uint8_t x[4]) -{ - uint32_t r; - r = (uint32_t)x[0]; - r |= (uint32_t)x[1] << 8; - r |= (uint32_t)x[2] << 16; - r |= (uint32_t)x[3] << 24; - return r; -} - -MLKEM_NATIVE_INTERNAL_API -void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3))) - { - unsigned j; - uint32_t t = load32_littleendian(buf + 4 * i); - uint32_t d = t & 0x55555555; - d += (t >> 1) & 0x55555555; - - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3))) - { - const int16_t a = (d >> (4 * j + 0)) & 
0x3; - const int16_t b = (d >> (4 * j + 2)) & 0x3; - r->coeffs[8 * i + j] = a - b; - } - } -} - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 -/************************************************* - * Name: load24_littleendian - * - * Description: load 3 bytes into a 32-bit integer - * in little-endian order. - * This function is only needed for ML-KEM-512 - * - * Arguments: - const uint8_t *x: pointer to input byte array - * - * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) - **************************************************/ -static uint32_t load24_littleendian(const uint8_t x[3]) -{ - uint32_t r; - r = (uint32_t)x[0]; - r |= (uint32_t)x[1] << 8; - r |= (uint32_t)x[2] << 16; - return r; -} - -MLKEM_NATIVE_INTERNAL_API -void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 4; i++) - __loop__( - invariant(i <= MLKEM_N / 4) - invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4))) - { - unsigned j; - const uint32_t t = load24_littleendian(buf + 3 * i); - uint32_t d = t & 0x00249249; - d += (t >> 1) & 0x00249249; - d += (t >> 2) & 0x00249249; - - for (j = 0; j < 4; j++) - __loop__( - invariant(i <= MLKEM_N / 4 && j <= 4) - invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4))) - { - const int16_t a = (d >> (6 * j + 0)) & 0x7; - const int16_t b = (d >> (6 * j + 3)) & 0x7; - r->coeffs[4 * i + j] = a - b; - } - } -} -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \ - 3 */ - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd) -int empty_cu_cbd; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbd.h deleted file mode 100644 index 54c1f5b90..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbd.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 
2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef CBD_H -#define CBD_H - -#include -#include "common.h" -#include "poly.h" - -#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2) -/************************************************* - * Name: poly_cbd2 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter eta=2 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]); - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 -#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3) -/************************************************* - * Name: poly_cbd3 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter eta=3. 
- * This function is only needed for ML-KEM-512 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]); -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */ - -#endif /* CBD_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/common.h index 4f326333e..62ed53ab1 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/common.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/common.h @@ -15,12 +15,19 @@ #include "sys.h" /* Include backend metadata */ -#if defined(MLKEM_USE_NATIVE) -#if defined(MLKEM_NATIVE_ARITH_BACKEND) -#include MLKEM_NATIVE_ARITH_BACKEND +#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) +#if defined(MLKEM_NATIVE_ARITH_BACKEND_FILE) +#include MLKEM_NATIVE_ARITH_BACKEND_FILE +#else +#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_ARITH is set, but MLKEM_NATIVE_ARITH_BACKEND_FILE is not. +#endif #endif -#if defined(MLKEM_NATIVE_FIPS202_BACKEND) -#include MLKEM_NATIVE_FIPS202_BACKEND + +#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) +#if defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE) +#include MLKEM_NATIVE_FIPS202_BACKEND_FILE +#else +#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, but MLKEM_NATIVE_FIPS202_BACKEND_FILE is not. 
#endif #endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/compress.c new file mode 100644 index 000000000..a03fe0ac4 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/compress.c @@ -0,0 +1,395 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include "common.h" +#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) + +#include +#include +#include "arith_backend.h" +#include "cbmc.h" +#include "compress.h" +#include "debug.h" +#include "verify.h" + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3) +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + uint8_t t[8] = {0}; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_bound(t, 0, j, 0, 16))) + { + t[j] = scalar_compress_d4(a->coeffs[8 * i + j]); + } + + r[i * 4] = t[0] | (t[1] << 4); + r[i * 4 + 1] = t[2] | (t[3] << 4); + r[i * 4 + 2] = t[4] | (t[5] << 4); + r[i * 4 + 3] = t[6] | (t[7] << 4); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a) +{ + unsigned j; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + for (j = 0; j < MLKEM_N / 4; j++) + __loop__(invariant(j <= MLKEM_N / 4)) + { + unsigned k; + uint16_t t[4]; + for (k = 0; k < 4; k++) + __loop__( + invariant(k <= 4) + invariant(forall(r, 0, k, t[r] < (1u << 10)))) + { + t[k] = scalar_compress_d10(a->coeffs[4 * j + k]); + } + + /* + * Make all implicit truncation explicit. No data is being + * truncated for the LHS's since each t[i] is 10-bit in size. 
+ */ + r[5 * j + 0] = (t[0] >> 0) & 0xFF; + r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF); + r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF); + r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF); + r[5 * j + 4] = (t[3] >> 2); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 2; i++) + __loop__( + invariant(i <= MLKEM_N / 2) + invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q))) + { + r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF); + r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF); + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d10(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]) +{ + unsigned j; + for (j = 0; j < MLKEM_N / 4; j++) + __loop__( + invariant(j <= MLKEM_N / 4) + invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q))) + { + unsigned k; + uint16_t t[4]; + uint8_t const *base = &a[5 * j]; + + t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); + t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6)); + t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4)); + t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2)); + + for (k = 0; k < 4; k++) + __loop__( + invariant(k <= 4) + invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q))) + { + r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ + || MLKEM_K == 3) */ + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + uint8_t t[8] = {0}; + for (j = 0; 
j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_bound(t, 0, j, 0, 32))) + { + t[j] = scalar_compress_d5(a->coeffs[8 * i + j]); + } + + /* + * Explicitly truncate to avoid warning about + * implicit truncation in CBMC, and use array indexing into + * r rather than pointer-arithmetic to simplify verification + */ + r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5)); + r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); + r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4)); + r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); + r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3)); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a) +{ + unsigned j; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (j = 0; j < MLKEM_N / 8; j++) + __loop__(invariant(j <= MLKEM_N / 8)) + { + unsigned k; + uint16_t t[8]; + for (k = 0; k < 8; k++) + __loop__( + invariant(k <= 8) + invariant(forall(r, 0, k, t[r] < (1u << 11)))) + { + t[k] = scalar_compress_d11(a->coeffs[8 * j + k]); + } + + /* + * Make all implicit truncation explicit. No data is being + * truncated for the LHS's since each t[i] is 11-bit in size. 
+ */ + r[11 * j + 0] = (t[0] >> 0) & 0xFF; + r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF); + r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF); + r[11 * j + 3] = (t[2] >> 2) & 0xFF; + r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF); + r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF); + r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF); + r[11 * j + 7] = (t[5] >> 1) & 0xFF; + r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF); + r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF); + r[11 * j + 10] = (t[7] >> 3); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) + { + unsigned j; + uint8_t t[8]; + const unsigned offset = i * 5; + /* + * Explicitly truncate to avoid warning about + * implicit truncation in CBMC and unwind loop for ease + * of proof. + */ + + /* + * Decompress 5 8-bit bytes (so 40 bits) into + * 8 5-bit values stored in t[] + */ + t[0] = 0x1F & (a[offset + 0] >> 0); + t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3)); + t[2] = 0x1F & (a[offset + 1] >> 2); + t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1)); + t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4)); + t[5] = 0x1F & (a[offset + 3] >> 1); + t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2)); + t[7] = 0x1F & (a[offset + 4] >> 3); + + /* and copy to the correct slice in r[] */ + for (j = 0; j < 8; j++) + __loop__( + invariant(j <= 8 && i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) + { + r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d11(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]) +{ + unsigned j; + for (j = 0; j < MLKEM_N / 8; j++) + __loop__( + invariant(j <= MLKEM_N 
/ 8) + invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q))) + { + unsigned k; + uint16_t t[8]; + uint8_t const *base = &a[11 * j]; + t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); + t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5)); + t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) | + ((uint16_t)base[4] << 10)); + t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7)); + t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4)); + t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) | + ((uint16_t)base[8] << 9)); + t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6)); + t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3)); + + for (k = 0; k < 8; k++) + __loop__( + invariant(k <= 8) + invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q))) + { + r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 */ + +#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 2; i++) + __loop__(invariant(i <= MLKEM_N / 2)) + { + const uint16_t t0 = a->coeffs[2 * i]; + const uint16_t t1 = a->coeffs[2 * i + 1]; + /* + * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of + * significant data, so these can be packed into 24 bits or exactly + * 3 bytes, as follows. + */ + + /* Least significant bits 0 - 7 of t0. */ + r[3 * i + 0] = t0 & 0xFF; + + /* + * Most significant bits 8 - 11 of t0 become the least significant + * nibble of the second byte. The least significant 4 bits + * of t1 become the upper nibble of the second byte. + */ + r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0); + + /* Bits 4 - 11 of t1 become the third byte.
*/ + r[3 * i + 2] = t1 >> 4; + } +} +#else /* MLKEM_USE_NATIVE_POLY_TOBYTES */ +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +{ + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + poly_tobytes_native(r, a->coeffs); +} +#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + +#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 2; i++) + __loop__( + invariant(i <= MLKEM_N / 2) + invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT))) + { + const uint8_t t0 = a[3 * i + 0]; + const uint8_t t1 = a[3 * i + 1]; + const uint8_t t2 = a[3 * i + 2]; + r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF); + r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4); + } + + /* Note that the coefficients are not canonical */ + debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT); +} +#else /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +{ + poly_frombytes_native(r->coeffs, a); +} +#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ + +MLKEM_NATIVE_INTERNAL_API +void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +{ + unsigned i; +#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8) +#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!" 
+#endif + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) + { + unsigned j; + for (j = 0; j < 8; j++) + __loop__( + invariant(i < MLKEM_N / 8 && j <= 8) + invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) + { + /* Prevent the compiler from recognizing this as a bit selection */ + uint8_t mask = value_barrier_u8(1u << j); + r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask); + } + } + debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + msg[i] = 0; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8)) + { + uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]); + msg[i] |= t << j; + } + } +} + +#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ + +#define empty_cu_compress MLKEM_NAMESPACE_K(empty_cu_compress) +int empty_cu_compress; + +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/compress.h new file mode 100644 index 000000000..409dbe519 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/compress.h @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef COMPRESS_H +#define COMPRESS_H + +#include +#include +#include "cbmc.h" +#include "common.h" +#include "debug.h" +#include "poly.h" +#include "verify.h" + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1) +#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4) +#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5) +#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10) +#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11) +#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4) +#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5) +#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10) +#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11) +/* End of static namespacing */ + +/************************************************************ + * Name: scalar_compress_d1 + * + * Description: Computes round(u * 2 / q) + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 1. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. + */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d1(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 2) + ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) ) +{ + uint32_t d0 = u << 1; + d0 *= 645083; + d0 += 1u << 30; + d0 >>= 31; + return d0; +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_compress_d4 + * + * Description: Computes round(u * 16 / q) % 16 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 4. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. 
+ ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. + */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d4(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 16) + ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16)) +{ + uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */ + return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */ +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d4 + * + * Description: Computes round(u * q / 16) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 4. + * + * Arguments: - u: Unsigned canonical modulus modulo 16 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d4(uint32_t u) +__contract__( + requires(0 <= u && u < 16) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 8) / 16; } + +/************************************************************ + * Name: scalar_compress_d5 + * + * Description: Computes round(u * 32 / q) % 32 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 5. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. 
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d5(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 32) + ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) ) +{ + uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */ + return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */ +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d5 + * + * Description: Computes round(u * q / 32) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 5. + * + * Arguments: - u: Unsigned canonical modulus modulo 32 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d5(uint32_t u) +__contract__( + requires(0 <= u && u < 32) + ensures(return_value <= MLKEM_Q - 1) +) { return ((u * MLKEM_Q) + 16) / 32; } + +/************************************************************ + * Name: scalar_compress_d10 + * + * Description: Computes round(u * 2**10 / q) % 2**10 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 10. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. 
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d10(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < (1u << 10)) + ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10))) +{ + uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^33 / MLKEM_Q) */ + d0 = (d0 + ((uint64_t)1u << 32)) >> 33; + return (d0 & 0x3FF); +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d10 + * + * Description: Computes round(u * q / 1024) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 10. + * + * Arguments: - u: Unsigned canonical modulus modulo 1024 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d10(uint32_t u) +__contract__( + requires(0 <= u && u < 1024) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 512) / 1024; } + +/************************************************************ + * Name: scalar_compress_d11 + * + * Description: Computes round(u * 2**11 / q) % 2**11 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 11. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required.
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d11(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < (1u << 11)) + ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11))) +{ + uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */ + d0 = (d0 + ((uint64_t)1u << 32)) >> 33; + return (d0 & 0x7FF); +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d11 + * + * Description: Computes round(u * q / 2048) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 11. + * + * Arguments: - u: Unsigned canonical modulus modulo 2048 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d11(uint32_t u) +__contract__( + requires(0 <= u && u < 2048) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 1024) / 2048; } + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \ + (MLKEM_K == 2 || MLKEM_K == 3) +#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4) +/************************************************* + * Name: poly_compress_d4 + * + * Description: Compression (4 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a); + +#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10) +/************************************************* + * Name: poly_compress_d10 + * + * Description: Compression (10 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a); + +#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4) +/************************************************* + * Name: poly_decompress_d4 + * + * Description: De-serialization and subsequent decompression (dv bits) of a + * polynomial; approximate inverse of poly_compress + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). 
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]); + +#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10) +/************************************************* + * Name: poly_decompress_d10 + * + * Description: De-serialization and subsequent decompression (10 bits) of a + * polynomial; approximate inverse of poly_compress_d10 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d10(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]); +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ + || MLKEM_K == 3) */ + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 +#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5) +/************************************************* + * Name: poly_compress_d5 + * + * Description: Compression (5 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a); + +#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11) +/************************************************* + * Name: poly_compress_d11 + * + * Description: Compression (11 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a); + +#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5) +/************************************************* + * Name: poly_decompress_d5 + * + * Description: De-serialization and subsequent decompression (dv bits) of a + * polynomial; approximate inverse of poly_compress + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). 
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]); + +#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11) +/************************************************* + * Name: poly_decompress_d11 + * + * Description: De-serialization and subsequent decompression (11 bits) of a + * polynomial; approximate inverse of poly_compress_d11 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d11(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]); +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \ + */ + +#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes) +/************************************************* + * Name: poly_tobytes + * + * Description: Serialization of a polynomial. + * Signed coefficients are converted to + * unsigned form before serialization. 
+ * + * Arguments: INPUT: + * - a: const pointer to input polynomial, + * with each coefficient in the range [0,1,..,Q-1] + * OUTPUT + * - r: pointer to output byte array + * (of MLKEM_POLYBYTES bytes) + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYBYTES)) + requires(memory_no_alias(a, sizeof(poly))) + requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(r)) +); + + +#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes) +/************************************************* + * Name: poly_frombytes + * + * Description: De-serialization of a polynomial. + * + * Arguments: INPUT + * - a: pointer to input byte array + * (of MLKEM_POLYBYTES bytes) + * OUTPUT + * - r: pointer to output polynomial, with + * each coefficient unsigned and in the range + * 0 .. 4095 + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)) +); + + +#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg) +/************************************************* + * Name: poly_frommsg + * + * Description: Convert 32-byte message to polynomial + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *msg: pointer to input message + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +__contract__( + requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(object_whole(r)) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) +); + +#define poly_tomsg 
MLKEM_NAMESPACE(poly_tomsg) +/************************************************* + * Name: poly_tomsg + * + * Description: Convert polynomial to 32-byte message + * + * Arguments: - uint8_t *msg: pointer to output message + * - const poly *r: pointer to input polynomial + * Coefficients must be unsigned canonical + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r) +__contract__( + requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(msg)) +); + +#endif /* COMPRESS_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/config.h index fa89370ce..e975ede95 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/config.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/config.h @@ -122,46 +122,87 @@ /* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ /****************************************************************************** - * Name: MLKEM_USE_NATIVE + * Name: MLKEM_USE_NATIVE_BACKEND_ARITH * - * Description: Determines whether a native backend should - * be used, if available. + * Description: Determines whether a native arithmetic backend should be used. + * + * The arithmetic backend covers performance critical functions + * such as the number-theoretic transform (NTT). + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the arithmetic backend to be used is + * determined by MLKEM_NATIVE_ARITH_BACKEND_FILE: If the latter is + * unset, the default backend for the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. * * This can also be set using CFLAGS.
* *****************************************************************************/ -#if !defined(MLKEM_USE_NATIVE) -/* #define MLKEM_USE_NATIVE */ +#if !defined(MLKEM_USE_NATIVE_BACKEND_ARITH) +/* #define MLKEM_USE_NATIVE_BACKEND_ARITH */ #endif /****************************************************************************** - * Name: MLKEM_NATIVE_ARITH_BACKEND + * Name: MLKEM_NATIVE_ARITH_BACKEND_FILE * * Description: The arithmetic backend to use. * - * This must be the filename of an arithmetic backend. - * See the existing backends for examples. + * If MLKEM_USE_NATIVE_BACKEND_ARITH is unset, this option + * is ignored. + * + * If MLKEM_USE_NATIVE_BACKEND_ARITH is set, this option must + * either be undefined or the filename of an arithmetic backend. + * If unset, the default backend will be used. * * This can be set using CFLAGS. * *****************************************************************************/ -#if defined(MLKEM_USE_NATIVE) && !defined(MLKEM_NATIVE_ARITH_BACKEND) -#define MLKEM_NATIVE_ARITH_BACKEND "default.h" -#endif /* MLKEM_NATIVE_ARITH_BACKEND */ +#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) && \ + !defined(MLKEM_NATIVE_ARITH_BACKEND_FILE) +#define MLKEM_NATIVE_ARITH_BACKEND_FILE "native/default.h" +#endif /****************************************************************************** - * Name: MLKEM_NATIVE_FIPS202_BACKEND + * Name: MLKEM_USE_NATIVE_BACKEND_FIPS202 + * + * Description: Determines whether a native FIPS202 backend should be used. + * + * The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is + * the performance bottleneck of SHA3 and SHAKE. + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the FIPS202 backend to be used is + * determined by MLKEM_NATIVE_FIPS202_BACKEND_FILE: If the latter is + * unset, the default backend for the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. + * + * This can also be set using CFLAGS.
+ * + *****************************************************************************/ +#if !defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) +/* #define MLKEM_USE_NATIVE_BACKEND_FIPS202 */ +#endif + +/****************************************************************************** + * Name: MLKEM_NATIVE_FIPS202_BACKEND_FILE * * Description: The FIPS-202 backend to use. * - * This must be the filename of an FIPS-202 backend. + * If MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, this option must + * either be undefined or the filename of a FIPS202 backend. + * If unset, the default backend will be used. * * This can be set using CFLAGS. * *****************************************************************************/ -#if defined(MLKEM_USE_NATIVE_FIPS202) && !defined(MLKEM_NATIVE_FIPS202_BACKEND) -#define MLKEM_NATIVE_FIPS202_BACKEND "native/default.h" -#endif /* MLKEM_NATIVE_FIPS202_BACKEND */ +#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) && \ + !defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE) +#define MLKEM_NATIVE_FIPS202_BACKEND_FILE "fips202/native/default.h" +#endif /************************* Config internals ********************************/ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.c index 0cfcc3e9e..318d0fc77 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.c @@ -9,11 +9,10 @@ #include "fips202.h" #include "fips202x4.h" #include "indcpa.h" -#include "ntt.h" #include "poly.h" -#include "polyvec.h" +#include "poly_k.h" #include "randombytes.h" -#include "rej_uniform.h" +#include "sampling.h" #include "symmetric.h" #include "arith_backend.h" @@ -149,14 +148,14 @@ static void unpack_ciphertext(polyvec *b, poly *v, #define poly_permute_bitrev_to_custom \ MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom) -static INLINE void poly_permute_bitrev_to_custom(poly *data) +static INLINE void poly_permute_bitrev_to_custom(int16_t 
data[MLKEM_N]) __contract__( /* We don't specify that this should be a permutation, but only * that it does not change the bound established at the end of gen_matrix. */ - requires(memory_no_alias(data, sizeof(poly))) - requires(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N)) + requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q)) assigns(memory_slice(data, sizeof(poly))) - ensures(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); } + ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); } #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ /* Not static for benchmarking */ @@ -245,7 +244,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed) { for (j = 0; j < MLKEM_K; j++) { - poly_permute_bitrev_to_custom(&a[i].vec[j]); + poly_permute_bitrev_to_custom(a[i].vec[j].coeffs); } } } diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.h index 2c4fda3c4..b4d5985bf 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.h @@ -8,7 +8,7 @@ #include #include "cbmc.h" #include "common.h" -#include "polyvec.h" +#include "poly_k.h" #define gen_matrix MLKEM_NAMESPACE_K(gen_matrix) /************************************************* diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/README.md b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/README.md similarity index 100% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/README.md rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/README.md diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/clean.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/clean.h similarity index 90% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/clean.h rename to 
src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/clean.h index 43a401dfc..f124702a4 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/clean.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/clean.h @@ -19,6 +19,6 @@ /* Filename of the C backend implementation. * This is not inlined here because this header is included in assembly * files as well. */ -#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "aarch64/src/clean_impl.h" +#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/aarch64/src/clean_impl.h" #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/opt.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/opt.h similarity index 91% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/opt.h rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/opt.h index 04323c3e7..a7217163f 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/opt.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/opt.h @@ -19,6 +19,6 @@ /* Filename of the C backend implementation. * This is not inlined here because this header is included in assembly * files as well. 
*/ -#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "aarch64/src/opt_impl.h" +#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/aarch64/src/opt_impl.h" #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/aarch64_zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/aarch64_zetas.c similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/aarch64_zetas.c rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/aarch64_zetas.c index 1e189fd99..b3a6f198f 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/aarch64_zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/aarch64_zetas.c @@ -8,7 +8,7 @@ * Do not modify it directly. */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/arith_native_aarch64.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/arith_native_aarch64.h similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/arith_native_aarch64.h rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/arith_native_aarch64.h index fc4e7dd38..a784a3027 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/arith_native_aarch64.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/arith_native_aarch64.h @@ -6,7 +6,7 @@ #define MLKEM_AARCH64_NATIVE_H #include -#include "common.h" +#include "../../../common.h" #define aarch64_ntt_zetas_layer01234 \ MLKEM_NAMESPACE(aarch64_ntt_zetas_layer01234) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/clean_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/clean_impl.h similarity index 58% rename from 
src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/clean_impl.h rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/clean_impl.h index 548b1eebb..ded7d067a 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/clean_impl.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/clean_impl.h @@ -12,9 +12,6 @@ #include "arith_native_aarch64.h" -#include "poly.h" -#include "polyvec.h" - /* Set of primitives that this backend replaces */ #define MLKEM_USE_NATIVE_NTT #define MLKEM_USE_NATIVE_INTT @@ -25,45 +22,46 @@ #define MLKEM_USE_NATIVE_POLY_TOBYTES #define MLKEM_USE_NATIVE_REJ_UNIFORM -static INLINE void ntt_native(poly *data) +static INLINE void ntt_native(int16_t data[MLKEM_N]) { - ntt_asm_clean(data->coeffs, aarch64_ntt_zetas_layer01234, - aarch64_ntt_zetas_layer56); + ntt_asm_clean(data, aarch64_ntt_zetas_layer01234, aarch64_ntt_zetas_layer56); } -static INLINE void intt_native(poly *data) +static INLINE void intt_native(int16_t data[MLKEM_N]) { - intt_asm_clean(data->coeffs, aarch64_invntt_zetas_layer01234, + intt_asm_clean(data, aarch64_invntt_zetas_layer01234, aarch64_invntt_zetas_layer56); } -static INLINE void poly_reduce_native(poly *data) +static INLINE void poly_reduce_native(int16_t data[MLKEM_N]) { - poly_reduce_asm_clean(data->coeffs); + poly_reduce_asm_clean(data); } -static INLINE void poly_tomont_native(poly *data) + +static INLINE void poly_tomont_native(int16_t data[MLKEM_N]) { - poly_tomont_asm_clean(data->coeffs); + poly_tomont_asm_clean(data); } -static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const poly *y) +static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2], + const int16_t y[MLKEM_N]) { - poly_mulcache_compute_asm_clean(x->coeffs, y->coeffs, - aarch64_zetas_mulcache_native, + poly_mulcache_compute_asm_clean(x, y, aarch64_zetas_mulcache_native, aarch64_zetas_mulcache_twisted_native); } + static INLINE void 
polyvec_basemul_acc_montgomery_cached_native( - poly *r, const polyvec *a, const polyvec *b, - const polyvec_mulcache *b_cache) + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]) { - polyvec_basemul_acc_montgomery_cached_asm_clean( - r->coeffs, a->vec[0].coeffs, b->vec[0].coeffs, b_cache->vec[0].coeffs); + polyvec_basemul_acc_montgomery_cached_asm_clean(r, a, b, b_cache); } static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], - const poly *a) + const int16_t a[MLKEM_N]) { - poly_tobytes_asm_clean(r, a->coeffs); + poly_tobytes_asm_clean(r, a); } static INLINE int rej_uniform_native(int16_t *r, unsigned int len, diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/consts.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/consts.h similarity index 94% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/consts.h rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/consts.h index c40947299..e3ea26a27 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/consts.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/consts.h @@ -7,7 +7,7 @@ #define MLKEM_NATIVE_AARCH64_CONSTS #include -#include "common.h" +#include "../../../common.h" #define zetas_mulcache_native MLKEM_NAMESPACE(zetas_mulcache_native) extern const int16_t zetas_mulcache_native[256]; diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_clean.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_clean.S rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_clean.S index b243a569d..28ad38975 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_clean.S +++ 
b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_clean.S @@ -23,7 +23,7 @@ /// SOFTWARE. /// -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) // Bounds: diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_opt.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_opt.S rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_opt.S index c94746e17..857c729cb 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_opt.S @@ -23,7 +23,7 @@ /// SOFTWARE. /// -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) // Bounds: diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_clean.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_clean.S rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_clean.S index cd63cc4d6..30fdc76b0 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_clean.S @@ -24,7 +24,7 @@ /// SOFTWARE. 
/// -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) // Bounds: diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_opt.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_opt.S rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_opt.S index 8705615b7..431f9dc6f 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_opt.S @@ -24,7 +24,7 @@ /// SOFTWARE. /// -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) // Bounds: diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/opt_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/opt_impl.h similarity index 58% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/opt_impl.h rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/opt_impl.h index ec1bf6587..eb8e39ed0 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/opt_impl.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/opt_impl.h @@ -10,11 +10,9 @@ #else #define MLKEM_NATIVE_ARITH_PROFILE_IMPL_H +#include "../../../params.h" #include "arith_native_aarch64.h" -#include "poly.h" -#include "polyvec.h" - /* Set of primitives that this backend replaces */ #define MLKEM_USE_NATIVE_NTT #define MLKEM_USE_NATIVE_INTT @@ -25,45 +23,46 @@ #define MLKEM_USE_NATIVE_POLY_TOBYTES #define MLKEM_USE_NATIVE_REJ_UNIFORM -static INLINE void ntt_native(poly *data) +static INLINE void ntt_native(int16_t data[MLKEM_N]) { - ntt_asm_opt(data->coeffs, aarch64_ntt_zetas_layer01234, - aarch64_ntt_zetas_layer56); + ntt_asm_opt(data, aarch64_ntt_zetas_layer01234, 
aarch64_ntt_zetas_layer56); } -static INLINE void intt_native(poly *data) +static INLINE void intt_native(int16_t data[MLKEM_N]) { - intt_asm_opt(data->coeffs, aarch64_invntt_zetas_layer01234, + intt_asm_opt(data, aarch64_invntt_zetas_layer01234, aarch64_invntt_zetas_layer56); } -static INLINE void poly_reduce_native(poly *data) +static INLINE void poly_reduce_native(int16_t data[MLKEM_N]) { - poly_reduce_asm_opt(data->coeffs); + poly_reduce_asm_opt(data); } -static INLINE void poly_tomont_native(poly *data) + +static INLINE void poly_tomont_native(int16_t data[MLKEM_N]) { - poly_tomont_asm_opt(data->coeffs); + poly_tomont_asm_opt(data); } -static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const poly *y) +static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2], + const int16_t y[MLKEM_N]) { - poly_mulcache_compute_asm_opt(x->coeffs, y->coeffs, - aarch64_zetas_mulcache_native, + poly_mulcache_compute_asm_opt(x, y, aarch64_zetas_mulcache_native, aarch64_zetas_mulcache_twisted_native); } + static INLINE void polyvec_basemul_acc_montgomery_cached_native( - poly *r, const polyvec *a, const polyvec *b, - const polyvec_mulcache *b_cache) + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]) { - polyvec_basemul_acc_montgomery_cached_asm_opt( - r->coeffs, a->vec[0].coeffs, b->vec[0].coeffs, b_cache->vec[0].coeffs); + polyvec_basemul_acc_montgomery_cached_asm_opt(r, a, b, b_cache); } static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], - const poly *a) + const int16_t a[MLKEM_N]) { - poly_tobytes_asm_opt(r, a->coeffs); + poly_tobytes_asm_opt(r, a); } static INLINE int rej_uniform_native(int16_t *r, unsigned int len, diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/optimize.sh b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/optimize.sh similarity index 100% rename from 
src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/optimize.sh rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/optimize.sh diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_clean.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_clean.S rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_clean.S index 809f9667e..f3ee0796f 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_clean.S @@ -3,7 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) /* diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_opt.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_opt.S rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_opt.S index 815a9dd1a..555c60a67 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_opt.S @@ -3,7 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) /* diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_clean.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_clean.S rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_clean.S index c91675b44..0b6df6345 100644 
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_clean.S @@ -9,7 +9,7 @@ // https://eprint.iacr.org/2021/986 // https://github.com/neon-ntt/neon-ntt -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) // Input: diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_opt.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_opt.S rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_opt.S index 8300b682c..7a27fda3e 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_opt.S @@ -9,7 +9,7 @@ // https://eprint.iacr.org/2021/986 // https://github.com/neon-ntt/neon-ntt -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) // Input: diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_asm_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_asm_clean.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_asm_clean.S rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_asm_clean.S index 5151a05d0..9158d6c82 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_asm_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_asm_clean.S @@ -18,7 +18,7 @@ * * Returns number of sampled 16-bit integers (at most MLKEM_N). 
**************************************************/ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_table.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_table.c similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_table.c rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_table.c index 507660349..29cdbe95f 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_table.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_table.c @@ -8,7 +8,7 @@ * Do not modify it directly. */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/api.h similarity index 90% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/api.h rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/api.h index 792ecb8a4..0704f9dcd 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/api.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/api.h @@ -23,8 +23,7 @@ #define MLKEM_NATIVE_ARITH_NATIVE_API_H #include -#include "poly.h" -#include "polyvec.h" +#include "../common.h" /* * This is the C<->native interface allowing for the drop-in of @@ -65,9 +64,9 @@ * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER * for more information. 
* - * Arguments: - poly *p: pointer to in/output polynomial + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial **************************************************/ -static INLINE void ntt_native(poly *); +static INLINE void ntt_native(int16_t p[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_NTT */ #if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER) @@ -96,10 +95,10 @@ and to/from bytes conversions." * * This must only be defined if there is native code for * all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache. - * Arguments: - poly *p: pointer to in/output polynomial + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial * **************************************************/ -static INLINE void poly_permute_bitrev_to_custom(poly *); +static INLINE void poly_permute_bitrev_to_custom(int16_t p[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ #if defined(MLKEM_USE_NATIVE_INTT) @@ -117,7 +116,7 @@ static INLINE void poly_permute_bitrev_to_custom(poly *); * * Arguments: - uint16_t *a: pointer to in/output polynomial **************************************************/ -static INLINE void intt_native(poly *); +static INLINE void intt_native(int16_t p[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_INTT */ #if defined(MLKEM_USE_NATIVE_POLY_REDUCE) @@ -126,9 +125,9 @@ static INLINE void intt_native(poly *); * * Description: Applies modular reduction to all coefficients of a polynomial. 
* - * Arguments: - poly *r: pointer to input/output polynomial + * Arguments: - int16_t r[MLKEM_N]: pointer to input/output polynomial **************************************************/ -static INLINE void poly_reduce_native(poly *); +static INLINE void poly_reduce_native(int16_t p[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ #if defined(MLKEM_USE_NATIVE_POLY_TOMONT) @@ -138,9 +137,9 @@ static INLINE void poly_reduce_native(poly *); * Description: Inplace conversion of all coefficients of a polynomial * from normal domain to Montgomery domain * - * Arguments: - poly *r: pointer to input/output polynomial + * Arguments: - int16_t r[MLKEM_N]: pointer to input/output polynomial **************************************************/ -static INLINE void poly_tomont_native(poly *); +static INLINE void poly_tomont_native(int16_t p[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ #if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) @@ -165,8 +164,8 @@ static INLINE void poly_tomont_native(poly *); * OUTPUT * - cache: pointer to multiplication cache **************************************************/ -static INLINE void poly_mulcache_compute_native(poly_mulcache *cache, - const poly *poly); +static INLINE void poly_mulcache_compute_native(int16_t cache[MLKEM_N / 2], + const int16_t poly[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ #if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) @@ -189,8 +188,9 @@ static INLINE void poly_mulcache_compute_native(poly_mulcache *cache, * in NTT domain, and of the same order as a and b. 
**************************************************/ static INLINE void polyvec_basemul_acc_montgomery_cached_native( - poly *r, const polyvec *a, const polyvec *b, - const polyvec_mulcache *b_cache); + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]); #endif #if defined(MLKEM_USE_NATIVE_POLY_TOBYTES) @@ -209,7 +209,7 @@ static INLINE void polyvec_basemul_acc_montgomery_cached_native( * (of MLKEM_POLYBYTES bytes) **************************************************/ static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], - const poly *a); + const int16_t a[MLKEM_N]); #endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ #if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) @@ -226,7 +226,7 @@ static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], * - a: const pointer to input byte aray * (of MLKEM_POLYBYTES bytes) **************************************************/ -static INLINE void poly_frombytes_native(poly *a, +static INLINE void poly_frombytes_native(int16_t a[MLKEM_N], const uint8_t r[MLKEM_POLYBYTES]); #endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/default.h similarity index 97% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/default.h rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/default.h index d1e41c52e..f9fe4310a 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/default.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/default.h @@ -8,7 +8,7 @@ /* * Default arithmetic backend */ -#include "sys.h" +#include "../sys.h" #ifdef SYS_AARCH64 /* diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/ntt.c deleted file mode 100644 index 3651c8da9..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/ntt.c +++ /dev/null @@ 
-1,266 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) - -#include -#include "arith_backend.h" -#include "debug.h" -#include "ntt.h" -#include "reduce.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block) -#define ntt_layer MLKEM_NAMESPACE(ntt_layer) -#define invntt_layer MLKEM_NAMESPACE(invntt_layer) -/* End of static namespacing */ - -#if !defined(MLKEM_USE_NATIVE_NTT) -/* - * Computes a block CT butterflies with a fixed twiddle factor, - * using Montgomery multiplication. - * Parameters: - * - r: Pointer to base of polynomial (_not_ the base of butterfly block) - * - root: Twiddle factor to use for the butterfly. This must be in - * Montgomery form and signed canonical. - * - start: Offset to the beginning of the butterfly block - * - len: Index difference between coefficients subject to a butterfly - * - bound: Ghost variable describing coefficient bound: Prior to `start`, - * coefficients must be bound by `bound + MLKEM_Q`. Post `start`, - * they must be bound by `bound`. - * When this function returns, output coefficients in the index range - * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`. 
- * Example: - * - start=8, len=4 - * This would compute the following four butterflies - * 8 -- 12 - * 9 -- 13 - * 10 -- 14 - * 11 -- 15 - * - start=4, len=2 - * This would compute the following two butterflies - * 4 -- 6 - * 5 -- 7 - */ -static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, - unsigned start, unsigned len, int bound) -__contract__( - requires(start < MLKEM_N) - requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N) - requires(0 <= bound && bound < INT16_MAX - MLKEM_Q) - requires(-HALF_Q < zeta && zeta < HALF_Q) - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(array_abs_bound(r, 0, start, bound + MLKEM_Q)) - requires(array_abs_bound(r, start, MLKEM_N, bound)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q)) - ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound))) -{ - /* `bound` is a ghost variable only needed in the CBMC specification */ - unsigned j; - ((void)bound); - for (j = start; j < start + len; j++) - __loop__( - invariant(start <= j && j <= start + len) - /* - * Coefficients are updated in strided pairs, so the bounds for the - * intermediate states alternate twice between the old and new bound - */ - invariant(array_abs_bound(r, 0, j, bound + MLKEM_Q)) - invariant(array_abs_bound(r, j, start + len, bound)) - invariant(array_abs_bound(r, start + len, j + len, bound + MLKEM_Q)) - invariant(array_abs_bound(r, j + len, MLKEM_N, bound))) - { - int16_t t; - t = fqmul(r[j + len], zeta); - r[j + len] = r[j] - t; - r[j] = r[j] + t; - } -} - -/* - *Compute one layer of forward NTT - * Parameters: - * - r: Pointer to base of polynomial - * - len: Stride of butterflies in this layer. - * - layer: Ghost variable indicating which layer is being applied. - * Must match `len` via `len == MLKEM_N >> layer`. 
- * Note: `len` could be dropped and computed in the function, but - * we are following the structure of the reference NTT from the - * official Kyber implementation here, merely adding `layer` as - * a ghost variable for the specifications. - */ -static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer) -__contract__( - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer)) - requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q))) -{ - unsigned start, k; - /* `layer` is a ghost variable only needed in the CBMC specification */ - ((void)layer); - /* Twiddle factors for layer n start at index 2^(layer-1) */ - k = MLKEM_N / (2 * len); - for (start = 0; start < MLKEM_N; start += 2 * len) - __loop__( - invariant(start < MLKEM_N + 2 * len) - invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N) - invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q)) - invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q))) - { - int16_t zeta = zetas[k++]; - ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q); - } -} - -/* - * Compute full forward NTT - * NOTE: This particular implementation satisfies a much tighter - * bound on the output coefficients (5*q) than the contractual one (8*q), - * but this is not needed in the calling code. Should we change the - * base multiplication strategy to require smaller NTT output bounds, - * the proof may need strengthening. 
- */ - -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *p) -{ - unsigned len, layer; - int16_t *r; - debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); - r = p->coeffs; - - for (len = 128, layer = 1; len >= 2; len >>= 1, layer++) - __loop__( - invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer)) - invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))) - { - ntt_layer(r, len, layer); - } - - /* Check the stronger bound */ - debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); -} -#else /* MLKEM_USE_NATIVE_NTT */ - -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *p) -{ - debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); - ntt_native(p); - debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); -} -#endif /* MLKEM_USE_NATIVE_NTT */ - -#if !defined(MLKEM_USE_NATIVE_INTT) - -/* Compute one layer of inverse NTT */ -static void invntt_layer(int16_t *r, unsigned len, unsigned layer) -__contract__( - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7) - requires(len == (1 << (8 - layer))) - requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) -{ - unsigned start, k; - /* `layer` is a ghost variable used only in the specification */ - ((void)layer); - k = MLKEM_N / len - 1; - for (start = 0; start < MLKEM_N; start += 2 * len) - __loop__( - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) - invariant(start <= MLKEM_N && k <= 127) - /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */ - invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len)) - { - unsigned j; - int16_t zeta = zetas[k--]; - for (j = start; j < start + len; j++) - __loop__( - invariant(start <= j && j <= start + len) - invariant(start <= MLKEM_N && k <= 127) - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) - { - int16_t t = r[j]; - r[j] = barrett_reduce(t + r[j + len]); - r[j + len] = r[j + len] - t; - r[j + len] = fqmul(r[j + len], 
zeta); - } - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *p) -{ - /* - * Scale input polynomial to account for Montgomery factor - * and NTT twist. This also brings coefficients down to - * absolute value < MLKEM_Q. - */ - unsigned j, len, layer; - const int16_t f = 1441; - int16_t *r = p->coeffs; - - for (j = 0; j < MLKEM_N; j++) - __loop__( - invariant(j <= MLKEM_N) - invariant(array_abs_bound(r, 0, j, MLKEM_Q))) - { - r[j] = fqmul(r[j], f); - } - - /* Run the invNTT layers */ - for (len = 2, layer = 7; len <= 128; len <<= 1, layer--) - __loop__( - invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer))) - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) - { - invntt_layer(p->coeffs, len, layer); - } - - debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); -} -#else /* MLKEM_USE_NATIVE_INTT */ - -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *p) -{ - intt_native(p); - debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); -} -#endif /* MLKEM_USE_NATIVE_INTT */ - -MLKEM_NATIVE_INTERNAL_API -void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], - int16_t b_cached) -{ - int32_t t0, t1; - debug_assert_bound(a, 2, 0, UINT12_LIMIT); - - t0 = (int32_t)a[1] * b_cached; - t0 += (int32_t)a[0] * b[0]; - t1 = (int32_t)a[0] * b[1]; - t1 += (int32_t)a[1] * b[0]; - - /* |ti| < 2 * q * 2^15 */ - r[0] = montgomery_reduce(t0); - r[1] = montgomery_reduce(t1); - - debug_assert_abs_bound(r, 2, 2 * MLKEM_Q); -} - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt) -int empty_cu_ntt; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/ntt.h deleted file mode 100644 index 4e80d3ab3..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/ntt.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * 
SPDX-License-Identifier: Apache-2.0 - */ -#ifndef NTT_H -#define NTT_H -#include "common.h" - -#include -#include "cbmc.h" -#include "poly.h" -#include "reduce.h" - -#define zetas MLKEM_NAMESPACE(zetas) -extern const int16_t zetas[128]; - -#define poly_ntt MLKEM_NAMESPACE(poly_ntt) -/************************************************* - * Name: poly_ntt - * - * Description: Computes negacyclic number-theoretic transform (NTT) of - * a polynomial in place. - * - * The input is assumed to be in normal order and - * coefficient-wise bound by MLKEM_Q in absolute value. - * - * The output polynomial is in bitreversed order, and - * coefficient-wise bound by NTT_BOUND in absolute value. - * - * (NOTE: Sometimes the input to the NTT is actually smaller, - * which gives better bounds.) - * - * Arguments: - poly *p: pointer to in/output polynomial - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *r) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q)) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND)) -); - -#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont) -/************************************************* - * Name: poly_invntt_tomont - * - * Description: Computes inverse of negacyclic number-theoretic transform (NTT) - * of a polynomial in place; - * inputs assumed to be in bitreversed order, output in normal - * order - * - * The input is assumed to be in bitreversed order, and can - * have arbitrary coefficients in int16_t. - * - * The output polynomial is in normal order, and - * coefficient-wise bound by INVNTT_BOUND in absolute value. 
- * - * Arguments: - uint16_t *a: pointer to in/output polynomial - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *r) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND)) -); - -#define basemul_cached MLKEM_NAMESPACE(basemul_cached) -/************************************************************ - * Name: basemul_cached - * - * Description: Computes a representative modulo q of - * (a0*b0 + a1*b_cached, a0*b1 + a1*b0)/65536 - * - * If b_cached is b1*zeta, this represents the - * product of (a0 + a1*X) and (b0 + b1*X) in - * Fq[X]/(X^2 - zeta). - * - * Arguments: - r: Pointer to output polynomial - * Upon return, coefficients are bound by - * 2*MLKEM_Q in absolute value. - * - a: Pointer to first input polynomial - * Every coefficient must be in [0..4095] - * - b: Pointer to second input polynomial - * Can have arbitrary int16_t coefficients - * - b_cached: Some precomputed value, typically derived from - * b1 and a twiddle factor. Can be an arbitary int16_t. 
- ************************************************************/ -MLKEM_NATIVE_INTERNAL_API -void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], - int16_t b_cached) -__contract__( - requires(memory_no_alias(r, 2 * sizeof(int16_t))) - requires(memory_no_alias(a, 2 * sizeof(int16_t))) - requires(memory_no_alias(b, 2 * sizeof(int16_t))) - requires(array_bound(a, 0, 2, 0, UINT12_LIMIT)) - assigns(memory_slice(r, 2 * sizeof(int16_t))) - ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q)) -); - -#endif /* NTT_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/params.h index 57ea4c8ba..7f6c12625 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/params.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/params.h @@ -18,6 +18,7 @@ #define MLKEM_N 256 #define MLKEM_Q 3329 #define UINT12_LIMIT 4096 +#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */ #define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define MLKEM_SSBYTES 32 /* size in bytes of shared key */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.c index 7483ebf6d..e8a2e2c6e 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.c @@ -8,388 +8,246 @@ #include #include #include "arith_backend.h" -#include "cbd.h" #include "cbmc.h" #include "debug.h" #include "fips202x4.h" -#include "ntt.h" #include "poly.h" -#include "reduce.h" +#include "sampling.h" #include "symmetric.h" #include "verify.h" -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3) -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a) -{ - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - uint8_t t[8] = {0}; - for 
(j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_bound(t, 0, j, 0, 16))) - { - t[j] = scalar_compress_d4(a->coeffs[8 * i + j]); - } - - r[i * 4] = t[0] | (t[1] << 4); - r[i * 4 + 1] = t[2] | (t[3] << 4); - r[i * 4 + 2] = t[4] | (t[5] << 4); - r[i * 4 + 3] = t[6] | (t[7] << 4); - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a) -{ - unsigned j; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - for (j = 0; j < MLKEM_N / 4; j++) - __loop__(invariant(j <= MLKEM_N / 4)) - { - unsigned k; - uint16_t t[4]; - for (k = 0; k < 4; k++) - __loop__( - invariant(k <= 4) - invariant(forall(r, 0, k, t[r] < (1u << 10)))) - { - t[k] = scalar_compress_d10(a->coeffs[4 * j + k]); - } - - /* - * Make all implicit truncation explicit. No data is being - * truncated for the LHS's since each t[i] is 10-bit in size. - */ - r[5 * j + 0] = (t[0] >> 0) & 0xFF; - r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF); - r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF); - r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF); - r[5 * j + 4] = (t[3] >> 2); - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 2; i++) - __loop__( - invariant(i <= MLKEM_N / 2) - invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q))) - { - r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF); - r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF); - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); -} - -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d10(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]) +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16) +#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic) +#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce) +#define fqmul MLKEM_NAMESPACE(fqmul) +#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce) +#define basemul_cached MLKEM_NAMESPACE(basemul_cached) +#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q) +#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block) +#define ntt_layer MLKEM_NAMESPACE(ntt_layer) +#define invntt_layer MLKEM_NAMESPACE(invntt_layer) +/* End of static namespacing */ + +/************************************************* + * Name: cast_uint16_to_int16 + * + * Description: Cast uint16 value to int16 + * + * Returns: + * input x in 0 .. 32767: returns value unchanged + * input x in 32768 .. 65535: returns (x - 65536) + **************************************************/ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "conversion" +#endif +ALWAYS_INLINE +static INLINE int16_t cast_uint16_to_int16(uint16_t x) { - unsigned j; - for (j = 0; j < MLKEM_N / 4; j++) - __loop__( - invariant(j <= MLKEM_N / 4) - invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q))) - { - unsigned k; - uint16_t t[4]; - uint8_t const *base = &a[5 * j]; - - t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); - t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6)); - t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4)); - t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2)); - - for (k = 0; k < 4; k++) - __loop__( - invariant(k <= 4) - invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q))) - { - r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]); - } - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + /* + * PORTABILITY: This relies on uint16_t -> int16_t + * being implemented as the inverse of int16_t -> uint16_t, + * which is implementation-defined (C99 6.3.1.3 (3)) + * 
CBMC (correctly) fails to prove this conversion is OK, + * so we have to suppress that check here + */ + return (int16_t)x; } -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ - || MLKEM_K == 3) */ +#ifdef CBMC +#pragma CPROVER check pop +#endif -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a) +/************************************************* + * Name: montgomery_reduce_generic + * + * Description: Generic Montgomery reduction; given a 32-bit integer a, computes + * 16-bit integer congruent to a * R^-1 mod q, where R=2^16 + * + * Arguments: - int32_t a: input integer to be reduced + * + * Returns: integer congruent to a * R^-1 modulo q, with absolute value + * <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2 + * + **************************************************/ +ALWAYS_INLINE +static INLINE int16_t montgomery_reduce_generic(int32_t a) { - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */ + const uint32_t QINV = 62209; /* q^-1 mod 2^16 */ - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - uint8_t t[8] = {0}; - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_bound(t, 0, j, 0, 32))) - { - t[j] = scalar_compress_d5(a->coeffs[8 * i + j]); - } + /* Compute a*q^{-1} mod 2^16 in unsigned representatives */ + const uint16_t a_reduced = a & UINT16_MAX; + const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX; - /* - * Explicitly truncate to avoid warning about - * implicit truncation in CBMC, and use array indexing into - * r rather than pointer-arithmetic to simplify verification - */ - r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5)); - r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); - r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4)); 
- r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); - r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3)); - } -} + /* Lift to signed canonical representative mod 2^16. */ + const int16_t t = cast_uint16_to_int16(a_inverted); -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a) -{ - unsigned j; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + int32_t r = a - ((int32_t)t * MLKEM_Q); + /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */ - for (j = 0; j < MLKEM_N / 8; j++) - __loop__(invariant(j <= MLKEM_N / 8)) - { - unsigned k; - uint16_t t[8]; - for (k = 0; k < 8; k++) - __loop__( - invariant(k <= 8) - invariant(forall(r, 0, k, t[r] < (1u << 11)))) - { - t[k] = scalar_compress_d11(a->coeffs[8 * j + k]); - } + /* + * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, + * implementation-defined for negative left argument. Here, + * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) + */ + r = r >> 16; + /* Bounds: |r >> 16| <= ceil(|r| / 2^16) + * <= ceil(|a| / 2^16 + MLKEM_Q / 2) + * <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 + * + * (Note that |a >> n| = ceil(|a| / 2^n) for negative a) + */ - /* - * Make all implicit truncation explicit. No data is being - * truncated for the LHS's since each t[i] is 11-bit in size. 
- */ - r[11 * j + 0] = (t[0] >> 0) & 0xFF; - r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF); - r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF); - r[11 * j + 3] = (t[2] >> 2) & 0xFF; - r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF); - r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF); - r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF); - r[11 * j + 7] = (t[5] >> 1) & 0xFF; - r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF); - r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF); - r[11 * j + 10] = (t[7] >> 3); - } + return (int16_t)r; } -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]) +/************************************************* + * Name: montgomery_reduce + * + * Description: Montgomery reduction + * + * Arguments: - int32_t a: input integer to be reduced + * Must be smaller than 2 * 2^12 * 2^15 in absolute value. + * + * Returns: integer congruent to a * R^-1 modulo q, + * smaller than 2 * q in absolute value. + **************************************************/ +static INLINE int16_t montgomery_reduce(int32_t a) +__contract__( + requires(a > -(2 * UINT12_LIMIT * 32768)) + requires(a < (2 * UINT12_LIMIT * 32768)) + ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q) +) { - unsigned i; - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) - { - unsigned j; - uint8_t t[8]; - const unsigned offset = i * 5; - /* - * Explicitly truncate to avoid warning about - * implicit truncation in CBMC and unwind loop for ease - * of proof. 
- */ - - /* - * Decompress 5 8-bit bytes (so 40 bits) into - * 8 5-bit values stored in t[] - */ - t[0] = 0x1F & (a[offset + 0] >> 0); - t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3)); - t[2] = 0x1F & (a[offset + 1] >> 2); - t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1)); - t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4)); - t[5] = 0x1F & (a[offset + 3] >> 1); - t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2)); - t[7] = 0x1F & (a[offset + 4] >> 3); - - /* and copy to the correct slice in r[] */ - for (j = 0; j < 8; j++) - __loop__( - invariant(j <= 8 && i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) - { - r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]); - } - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + int16_t res; + debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768); + + res = montgomery_reduce_generic(a); + /* Bounds: + * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2 + * <= UINT12_LIMIT + (MLKEM_Q + 1) / 2 + * < 2 * MLKEM_Q */ + + debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q); + return res; } -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d11(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]) +#if !defined(MLKEM_USE_NATIVE_POLY_TOMONT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: fqmul + * + * Description: Montgomery multiplication modulo q=3329 + * + * Arguments: - int16_t a: first factor + * Can be any int16_t. + * - int16_t b: second factor. + * Must be signed canonical (abs value <(q+1)/2) + * + * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and + * smaller than q in absolute value. 
+ * + **************************************************/ +static INLINE int16_t fqmul(int16_t a, int16_t b) +__contract__( + requires(b > -HALF_Q) + requires(b < HALF_Q) + ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q) +) { - unsigned j; - for (j = 0; j < MLKEM_N / 8; j++) - __loop__( - invariant(j <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q))) - { - unsigned k; - uint16_t t[8]; - uint8_t const *base = &a[11 * j]; - t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); - t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5)); - t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) | - ((uint16_t)base[4] << 10)); - t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7)); - t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4)); - t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) | - ((uint16_t)base[8] << 9)); - t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6)); - t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3)); - - for (k = 0; k < 8; k++) - __loop__( - invariant(k <= 8) - invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q))) - { - r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]); - } - } + int16_t res; + debug_assert_abs_bound(&b, 1, HALF_Q); + + res = montgomery_reduce((int32_t)a * (int32_t)b); + /* Bounds: + * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2 + * < MLKEM_Q + */ - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + debug_assert_abs_bound(&res, 1, MLKEM_Q); + return res; } -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */ - -#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +#endif /* !defined(MLKEM_USE_NATIVE_POLY_TOMONT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_NTT) || \ + 
!defined(MLKEM_USE_NATIVE_INTT) */ + +#if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || !defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: barrett_reduce + * + * Description: Barrett reduction; given a 16-bit integer a, computes + * centered representative congruent to a mod q in + * {-(q-1)/2,...,(q-1)/2} + * + * Arguments: - int16_t a: input integer to be reduced + * + * Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. + **************************************************/ +static INLINE int16_t barrett_reduce(int16_t a) +__contract__( + ensures(return_value > -HALF_Q && return_value < HALF_Q) +) { - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_N / 2; i++) - __loop__(invariant(i <= MLKEM_N / 2)) - { - const uint16_t t0 = a->coeffs[2 * i]; - const uint16_t t1 = a->coeffs[2 * i + 1]; - /* - * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of - * significant data, so these can be packed into 24 bits or exactly - * 3 bytes, as follows. - */ - - /* Least significant bits 0 - 7 of t0. */ - r[3 * i + 0] = t0 & 0xFF; - - /* - * Most significant bits 8 - 11 of t0 become the least significant - * nibble of the second byte. The least significant 4 bits - * of t1 become the upper nibble of the second byte. - */ - r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0); + /* + * To divide by MLKEM_Q using Barrett multiplication, the "magic number" + * multiplier is round_to_nearest(2**26/MLKEM_Q) + */ + const int BPOWER = 26; + const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q; - /* Bits 4 - 11 of t1 become the third byte. 
*/ - r[3 * i + 2] = t1 >> 4; - } -} -#else /* MLKEM_USE_NATIVE_POLY_TOBYTES */ -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) -{ - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - poly_tobytes_native(r, a); -} -#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + /* + * Compute round_to_nearest(a/MLKEM_Q) using the multiplier + * above and shift by BPOWER places. + * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, + * implementation-defined for negative left argument. Here, + * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) + */ + const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER; -#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 2; i++) - __loop__( - invariant(i <= MLKEM_N / 2) - invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT))) - { - const uint8_t t0 = a[3 * i + 0]; - const uint8_t t1 = a[3 * i + 1]; - const uint8_t t2 = a[3 * i + 2]; - r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF); - r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4); - } + /* + * t is in -10 .. 
+10, so we need 32-bit math to + * evaluate t * MLKEM_Q and the subsequent subtraction + */ + int16_t res = (int16_t)(a - t * MLKEM_Q); - /* Note that the coefficients are not canonical */ - debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT); -} -#else /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -{ - poly_frombytes_native(r, a); + debug_assert_abs_bound(&res, 1, HALF_Q); + return res; } -#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ - -MLKEM_NATIVE_INTERNAL_API -void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +#endif /* !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || \ + !defined(MLKEM_USE_NATIVE_INTT) */ + +static void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], + int16_t b_cached) +__contract__( + requires(memory_no_alias(r, 2 * sizeof(int16_t))) + requires(memory_no_alias(a, 2 * sizeof(int16_t))) + requires(memory_no_alias(b, 2 * sizeof(int16_t))) + requires(array_bound(a, 0, 2, 0, UINT12_LIMIT)) + assigns(memory_slice(r, 2 * sizeof(int16_t))) + ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))) { - unsigned i; -#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8) -#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!" 
-#endif + int32_t t0, t1; + debug_assert_bound(a, 2, 0, UINT12_LIMIT); - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) - { - unsigned j; - for (j = 0; j < 8; j++) - __loop__( - invariant(i < MLKEM_N / 8 && j <= 8) - invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) - { - /* Prevent the compiler from recognizing this as a bit selection */ - uint8_t mask = value_barrier_u8(1u << j); - r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask); - } - } - debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); -} + t0 = (int32_t)a[1] * b_cached; + t0 += (int32_t)a[0] * b[0]; + t1 = (int32_t)a[0] * b[1]; + t1 += (int32_t)a[1] * b[0]; -MLKEM_NATIVE_INTERNAL_API -void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a) -{ - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + /* |ti| < 2 * UINT12_LIMIT * 2^15, as required by montgomery_reduce() */ + r[0] = montgomery_reduce(t0); + r[1] = montgomery_reduce(t1); - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - msg[i] = 0; - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8)) - { - uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]); - msg[i] |= t << j; - } - } + debug_assert_abs_bound(r, 2, 2 * MLKEM_Q); } MLKEM_NATIVE_INTERNAL_API @@ -434,12 +292,46 @@ void poly_tomont(poly *r) MLKEM_NATIVE_INTERNAL_API void poly_tomont(poly *r) { - poly_tomont_native(r); + poly_tomont_native(r->coeffs); debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); } #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ #if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) +/************************************************************ + * Name: scalar_signed_to_unsigned_q + * + * Description: converts signed polynomial coefficient + * from signed (-3328 .. 3328) form to + * unsigned form (0 .. 3328). 
+ * + * Note: Cryptographic constant time implementation + * + * Examples: 0 -> 0 + * 1 -> 1 + * 3328 -> 3328 + * -1 -> 3328 + * -2 -> 3327 + * -3328 -> 1 + * + * Arguments: c: signed coefficient to be converted + ************************************************************/ +static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c) +__contract__( + requires(c > -MLKEM_Q && c < MLKEM_Q) + ensures(return_value >= 0 && return_value < MLKEM_Q) + ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q))) +{ + debug_assert_abs_bound(&c, 1, MLKEM_Q); + + /* Add Q if c is negative, but in constant time */ + c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c)); + + /* and therefore cast to uint16_t is safe. */ + debug_assert_bound(&c, 1, 0, MLKEM_Q); + return (uint16_t)c; +} + MLKEM_NATIVE_INTERNAL_API void poly_reduce(poly *r) { @@ -461,7 +353,7 @@ void poly_reduce(poly *r) MLKEM_NATIVE_INTERNAL_API void poly_reduce(poly *r) { - poly_reduce_native(r); + poly_reduce_native(r->coeffs); debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); } #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ @@ -520,13 +412,232 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a) MLKEM_NATIVE_INTERNAL_API void poly_mulcache_compute(poly_mulcache *x, const poly *a) { - poly_mulcache_compute_native(x, a); + poly_mulcache_compute_native(x->coeffs, a->coeffs); /* Omitting bounds assertion since native implementations may * decide not to use a mulcache. Note that the C backend implementation * of poly_basemul_montgomery_cached() does still include the check. */ } #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ +#if !defined(MLKEM_USE_NATIVE_NTT) +/* + * Computes a block of CT butterflies with a fixed twiddle factor, + * using Montgomery multiplication. + * Parameters: + * - r: Pointer to base of polynomial (_not_ the base of butterfly block) + * - zeta: Twiddle factor to use for the butterfly. This must be in + * Montgomery form and signed canonical. 
+ * - start: Offset to the beginning of the butterfly block + * - len: Index difference between coefficients subject to a butterfly + * - bound: Ghost variable describing coefficient bound: Prior to `start`, + * coefficients must be bound by `bound + MLKEM_Q`. Post `start`, + * they must be bound by `bound`. + * When this function returns, output coefficients in the index range + * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`. + * Example: + * - start=8, len=4 + * This would compute the following four butterflies + * 8 -- 12 + * 9 -- 13 + * 10 -- 14 + * 11 -- 15 + * - start=4, len=2 + * This would compute the following two butterflies + * 4 -- 6 + * 5 -- 7 + */ +static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, + unsigned start, unsigned len, int bound) +__contract__( + requires(start < MLKEM_N) + requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N) + requires(0 <= bound && bound < INT16_MAX - MLKEM_Q) + requires(-HALF_Q < zeta && zeta < HALF_Q) + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(array_abs_bound(r, 0, start, bound + MLKEM_Q)) + requires(array_abs_bound(r, start, MLKEM_N, bound)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q)) + ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound))) +{ + /* `bound` is a ghost variable only needed in the CBMC specification */ + unsigned j; + ((void)bound); + for (j = start; j < start + len; j++) + __loop__( + invariant(start <= j && j <= start + len) + /* + * Coefficients are updated in strided pairs, so the bounds for the + * intermediate states alternate twice between the old and new bound + */ + invariant(array_abs_bound(r, 0, j, bound + MLKEM_Q)) + invariant(array_abs_bound(r, j, start + len, bound)) + invariant(array_abs_bound(r, start + len, j + len, bound + MLKEM_Q)) + invariant(array_abs_bound(r, j + len, MLKEM_N, bound))) + { + int16_t t; + t = fqmul(r[j + len], 
zeta); + r[j + len] = r[j] - t; + r[j] = r[j] + t; + } +} + +/* + * Compute one layer of forward NTT + * Parameters: + * - r: Pointer to base of polynomial + * - len: Stride of butterflies in this layer. + * - layer: Ghost variable indicating which layer is being applied. + * Must match `len` via `len == MLKEM_N >> layer`. + * Note: `len` could be dropped and computed in the function, but + * we are following the structure of the reference NTT from the + * official Kyber implementation here, merely adding `layer` as + * a ghost variable for the specifications. + */ +static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer) +__contract__( + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer)) + requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q))) +{ + unsigned start, k; + /* `layer` is a ghost variable only needed in the CBMC specification */ + ((void)layer); + /* Twiddle factors for layer n start at index 2^(layer-1) */ + k = MLKEM_N / (2 * len); + for (start = 0; start < MLKEM_N; start += 2 * len) + __loop__( + invariant(start < MLKEM_N + 2 * len) + invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N) + invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q)) + invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q))) + { + int16_t zeta = zetas[k++]; + ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q); + } +} + +/* + * Compute full forward NTT + * NOTE: This particular implementation satisfies a much tighter + * bound on the output coefficients (5*q) than the contractual one (8*q), + * but this is not needed in the calling code. Should we change the + * base multiplication strategy to require smaller NTT output bounds, + * the proof may need strengthening. 
+ */ + +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *p) +{ + unsigned len, layer; + int16_t *r; + debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); + r = p->coeffs; + + for (len = 128, layer = 1; len >= 2; len >>= 1, layer++) + __loop__( + invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer)) + invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))) + { + ntt_layer(r, len, layer); + } + + /* Check the stronger bound */ + debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); +} +#else /* MLKEM_USE_NATIVE_NTT */ + +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *p) +{ + debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); + ntt_native(p->coeffs); + debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); +} +#endif /* MLKEM_USE_NATIVE_NTT */ + +#if !defined(MLKEM_USE_NATIVE_INTT) + +/* Compute one layer of inverse NTT */ +static void invntt_layer(int16_t *r, unsigned len, unsigned layer) +__contract__( + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7) + requires(len == (1 << (8 - layer))) + requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) +{ + unsigned start, k; + /* `layer` is a ghost variable used only in the specification */ + ((void)layer); + k = MLKEM_N / len - 1; + for (start = 0; start < MLKEM_N; start += 2 * len) + __loop__( + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) + invariant(start <= MLKEM_N && k <= 127) + /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */ + invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len)) + { + unsigned j; + int16_t zeta = zetas[k--]; + for (j = start; j < start + len; j++) + __loop__( + invariant(start <= j && j <= start + len) + invariant(start <= MLKEM_N && k <= 127) + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) + { + int16_t t = r[j]; + r[j] = barrett_reduce(t + r[j + len]); + r[j + len] = r[j + len] - t; + r[j + len] = fqmul(r[j 
+ len], zeta); + } + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *p) +{ + /* + * Scale input polynomial to account for Montgomery factor + * and NTT twist. This also brings coefficients down to + * absolute value < MLKEM_Q. + */ + unsigned j, len, layer; + const int16_t f = 1441; + int16_t *r = p->coeffs; + + for (j = 0; j < MLKEM_N; j++) + __loop__( + invariant(j <= MLKEM_N) + invariant(array_abs_bound(r, 0, j, MLKEM_Q))) + { + r[j] = fqmul(r[j], f); + } + + /* Run the invNTT layers */ + for (len = 2, layer = 7; len <= 128; len <<= 1, layer--) + __loop__( + invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer))) + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) + { + invntt_layer(p->coeffs, len, layer); + } + + debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); +} +#else /* MLKEM_USE_NATIVE_INTT */ + +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *p) +{ + intt_native(p->coeffs); + debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); +} +#endif /* MLKEM_USE_NATIVE_INTT */ + #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ #define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.h index 6a14c785d..cb0d67c1a 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.h @@ -9,7 +9,7 @@ #include #include "cbmc.h" #include "common.h" -#include "reduce.h" +#include "debug.h" #include "verify.h" /* Absolute exclusive upper bound for the output of the inverse NTT */ @@ -18,6 +18,9 @@ /* Absolute exclusive upper bound for the output of the forward NTT */ #define NTT_BOUND (8 * MLKEM_Q) +#define zetas MLKEM_NAMESPACE(zetas) +extern const int16_t zetas[128]; + /* * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] @@ -38,520 +41,6 @@ typedef struct int16_t coeffs[MLKEM_N >> 1]; } poly_mulcache; -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1) -#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4) -#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5) -#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10) -#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11) -#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4) -#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5) -#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10) -#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11) -#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q) -/* End of static namespacing */ - -/************************************************************ - * Name: scalar_compress_d1 - * - * Description: Computes round(u * 2 / q) - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 1. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. 
- */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d1(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 2) - ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) ) -{ - uint32_t d0 = u << 1; - d0 *= 645083; - d0 += 1u << 30; - d0 >>= 31; - return d0; -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_compress_d4 - * - * Description: Computes round(u * 16 / q) % 16 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 4. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d4(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 16) - ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16)) -{ - uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */ - return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */ -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d4 - * - * Description: Computes round(u * q / 16) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 4. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d4(uint32_t u) -__contract__( - requires(0 <= u && u < 16) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 8) / 16; } - -/************************************************************ - * Name: scalar_compress_d5 - * - * Description: Computes round(u * 32 / q) % 32 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 5. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d5(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 32) - ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) ) -{ - uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */ - return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */ -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d5 - * - * Description: Computes round(u * q / 32) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 5. - * - * Arguments: - u: Unsigned canonical modulus modulo 32 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d5(uint32_t u) -__contract__( - requires(0 <= u && u < 32) - ensures(return_value <= MLKEM_Q - 1) -) { return ((u * MLKEM_Q) + 16) / 32; } - -/************************************************************ - * Name: scalar_compress_d10 - * - * Description: Computes round(u * 2**10 / q) % 2**10 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d10(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < (1u << 10)) - ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10))) -{ - uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^32 / MLKEM_Q) */ - d0 = (d0 + ((uint64_t)1u << 32)) >> 33; - return (d0 & 0x3FF); -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d10 - * - * Description: Computes round(u * q / 1024) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d10(uint32_t u) -__contract__( - requires(0 <= u && u < 1024) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 512) / 1024; } - -/************************************************************ - * Name: scalar_compress_d11 - * - * Description: Computes round(u * 2**11 / q) % 2**11 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 11. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d11(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < (1u << 11)) - ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11))) -{ - uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */ - d0 = (d0 + ((uint64_t)1u << 32)) >> 33; - return (d0 & 0x7FF); -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d11 - * - * Description: Computes round(u * q / 1024) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d11(uint32_t u) -__contract__( - requires(0 <= u && u < 2048) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 1024) / 2048; } - -/************************************************************ - * Name: scalar_signed_to_unsigned_q - * - * Description: converts signed polynomial coefficient - * from signed (-3328 .. 3328) form to - * unsigned form (0 .. 3328). - * - * Note: Cryptographic constant time implementation - * - * Examples: 0 -> 0 - * 1 -> 1 - * 3328 -> 3328 - * -1 -> 3328 - * -2 -> 3327 - * -3328 -> 1 - * - * Arguments: c: signed coefficient to be converted - ************************************************************/ -static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c) -__contract__( - requires(c > -MLKEM_Q && c < MLKEM_Q) - ensures(return_value >= 0 && return_value < MLKEM_Q) - ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q))) -{ - debug_assert_abs_bound(&c, 1, MLKEM_Q); - - /* Add Q if c is negative, but in constant time */ - c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c)); - - /* and therefore cast to uint16_t is safe. */ - debug_assert_bound(&c, 1, 0, MLKEM_Q); - return (uint16_t)c; -} - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \ - (MLKEM_K == 2 || MLKEM_K == 3) -#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4) -/************************************************* - * Name: poly_compress_d4 - * - * Description: Compression (4 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a); - -#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10) -/************************************************* - * Name: poly_compress_d10 - * - * Description: Compression (10 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a); - -#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4) -/************************************************* - * Name: poly_decompress_d4 - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]); - -#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10) -/************************************************* - * Name: poly_decompress_d10 - * - * Description: De-serialization and subsequent decompression (10 bits) of a - * polynomial; approximate inverse of poly_compress_d10 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d10(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]); -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ - || MLKEM_K == 3) */ - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 -#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5) -/************************************************* - * Name: poly_compress_d5 - * - * Description: Compression (5 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a); - -#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11) -/************************************************* - * Name: poly_compress_d11 - * - * Description: Compression (11 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a); - -#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5) -/************************************************* - * Name: poly_decompress_d5 - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]); - -#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11) -/************************************************* - * Name: poly_decompress_d11 - * - * Description: De-serialization and subsequent decompression (11 bits) of a - * polynomial; approximate inverse of poly_compress_d11 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d11(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]); -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \ - */ - -#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes) -/************************************************* - * Name: poly_tobytes - * - * Description: Serialization of a polynomial. - * Signed coefficients are converted to - * unsigned form before serialization. 
- * - * Arguments: INPUT: - * - a: const pointer to input polynomial, - * with each coefficient in the range [0,1,..,Q-1] - * OUTPUT - * - r: pointer to output byte array - * (of MLKEM_POLYBYTES bytes) - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYBYTES)) - requires(memory_no_alias(a, sizeof(poly))) - requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(r)) -); - - -#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes) -/************************************************* - * Name: poly_frombytes - * - * Description: De-serialization of a polynomial. - * - * Arguments: INPUT - * - a: pointer to input byte array - * (of MLKEM_POLYBYTES bytes) - * OUTPUT - * - r: pointer to output polynomial, with - * each coefficient unsigned and in the range - * 0 .. 4095 - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)) -); - - -#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg) -/************************************************* - * Name: poly_frommsg - * - * Description: Convert 32-byte message to polynomial - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *msg: pointer to input message - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) -__contract__( - requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(object_whole(r)) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) -); - -#define poly_tomsg 
MLKEM_NAMESPACE(poly_tomsg) -/************************************************* - * Name: poly_tomsg - * - * Description: Convert polynomial to 32-byte message - * - * Arguments: - uint8_t *msg: pointer to output message - * - const poly *r: pointer to input polynomial - * Coefficients must be unsigned canonical - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r) -__contract__( - requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(msg)) -); - #define poly_basemul_montgomery_cached \ MLKEM_NAMESPACE(poly_basemul_montgomery_cached) /************************************************* @@ -715,4 +204,56 @@ __contract__( assigns(object_whole(r)) ); +#define poly_ntt MLKEM_NAMESPACE(poly_ntt) +/************************************************* + * Name: poly_ntt + * + * Description: Computes negacyclic number-theoretic transform (NTT) of + * a polynomial in place. + * + * The input is assumed to be in normal order and + * coefficient-wise bound by MLKEM_Q in absolute value. + * + * The output polynomial is in bitreversed order, and + * coefficient-wise bound by NTT_BOUND in absolute value. + * + * (NOTE: Sometimes the input to the NTT is actually smaller, + * which gives better bounds.) 
+ * + * Arguments: - poly *r: pointer to in/output polynomial + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *r) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q)) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND)) +); + +#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont) +/************************************************* + * Name: poly_invntt_tomont + * + * Description: Computes inverse of negacyclic number-theoretic transform (NTT) + * of a polynomial in place; + * inputs assumed to be in bitreversed order, output in normal + * order + * + * The input is assumed to be in bitreversed order, and can + * have arbitrary coefficients in int16_t. + * + * The output polynomial is in normal order, and + * coefficient-wise bound by INVNTT_BOUND in absolute value. + * + * Arguments: - poly *r: pointer to in/output polynomial + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *r) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND)) +); + #endif /* POLY_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly_k.c similarity index 97% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.c rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly_k.c index 50ea1c34a..c2d330ea9 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly_k.c @@ -2,13 +2,12 @@ * Copyright (c) 2024 The mlkem-native project authors * SPDX-License-Identifier: Apache-2.0 */ -#include "polyvec.h" +#include "poly_k.h" #include #include #include "arith_backend.h" -#include "cbd.h" -#include "ntt.h"
-#include "poly.h" +#include "compress.h" +#include "sampling.h" #include "symmetric.h" #include "debug.h" @@ -131,7 +130,9 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, /* Omitting bounds assertion for cache since native implementations may * decide not to use a mulcache. Note that the C backend implementation * of poly_basemul_montgomery_cached() does still include the check. */ - polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache); + polyvec_basemul_acc_montgomery_cached_native(r->coeffs, (const int16_t *)a, + (const int16_t *)b, + (const int16_t *)b_cache); } #endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly_k.h similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/polyvec.h rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly_k.h index 8be8579e0..0aea95912 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/polyvec.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly_k.h @@ -2,11 +2,12 @@ * Copyright (c) 2024 The mlkem-native project authors * SPDX-License-Identifier: Apache-2.0 */ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef POLY_K_H +#define POLY_K_H #include #include "common.h" +#include "compress.h" #include "poly.h" #define polyvec MLKEM_NAMESPACE_K(polyvec) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/reduce.h deleted file mode 100644 index b432a4201..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/reduce.h +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef REDUCE_H -#define REDUCE_H - -#include -#include "cbmc.h" -#include "common.h" -#include "debug.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of 
mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16) -#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic) -#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce) -#define fqmul MLKEM_NAMESPACE(fqmul) -#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce) -/* End of static namespacing */ - -#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */ - -/************************************************* - * Name: cast_uint16_to_int16 - * - * Description: Cast uint16 value to int16 - * - * Returns: - * input x in 0 .. 32767: returns value unchanged - * input x in 32768 .. 65535: returns (x - 65536) - **************************************************/ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "conversion" -#endif -ALWAYS_INLINE -static INLINE int16_t cast_uint16_to_int16(uint16_t x) -{ - /* - * PORTABILITY: This relies on uint16_t -> int16_t - * being implemented as the inverse of int16_t -> uint16_t, - * which is implementation-defined (C99 6.3.1.3 (3)) - * CBMC (correctly) fails to prove this conversion is OK, - * so we have to suppress that check here - */ - return (int16_t)x; -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************* - * Name: montgomery_reduce_generic - * - * Description: Generic Montgomery reduction; given a 32-bit integer a, computes - * 16-bit integer congruent to a * R^-1 mod q, where R=2^16 - * - * Arguments: - int32_t a: input integer to be reduced - * - * Returns: integer congruent to a * R^-1 modulo q, with absolute value - * <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2 - * - **************************************************/ -ALWAYS_INLINE -static INLINE int16_t montgomery_reduce_generic(int32_t a) -{ - /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */ - const uint32_t QINV = 62209; /* q^-1 mod 2^16 */ - - /* Compute a*q^{-1} mod 
2^16 in unsigned representatives */ - const uint16_t a_reduced = a & UINT16_MAX; - const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX; - - /* Lift to signed canonical representative mod 2^16. */ - const int16_t t = cast_uint16_to_int16(a_inverted); - - int32_t r = a - ((int32_t)t * MLKEM_Q); - /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */ - - /* - * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, - * implementation-defined for negative left argument. Here, - * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) - */ - r = r >> 16; - /* Bounds: |r >> 16| <= ceil(|r| / 2^16) - * <= ceil(|a| / 2^16 + MLKEM_Q / 2) - * <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 - * - * (Note that |a >> n| = ceil(|a| / 2^16) for negative a) - */ - - return (int16_t)r; -} - -/************************************************* - * Name: montgomery_reduce - * - * Description: Montgomery reduction - * - * Arguments: - int32_t a: input integer to be reduced - * Must be smaller than 2 * 2^12 * 2^15 in absolute value. - * - * Returns: integer congruent to a * R^-1 modulo q, - * smaller than 2 * q in absolute value. - **************************************************/ -static INLINE int16_t montgomery_reduce(int32_t a) -__contract__( - requires(a > -(2 * UINT12_LIMIT * 32768)) - requires(a < (2 * UINT12_LIMIT * 32768)) - ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q) -) -{ - int16_t res; - debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768); - - res = montgomery_reduce_generic(a); - /* Bounds: - * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2 - * <= UINT12_LIMIT + (MLKEM_Q + 1) / 2 - * < 2 * MLKEM_Q */ - - debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q); - return res; -} - -/************************************************* - * Name: fqmul - * - * Description: Montgomery multiplication modulo q=3329 - * - * Arguments: - int16_t a: first factor - * Can be any int16_t. 
- * - int16_t b: second factor. - * Must be signed canonical (abs value <(q+1)/2) - * - * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and - * smaller than q in absolute value. - * - **************************************************/ -static INLINE int16_t fqmul(int16_t a, int16_t b) -__contract__( - requires(b > -HALF_Q) - requires(b < HALF_Q) - ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q) -) -{ - int16_t res; - debug_assert_abs_bound(&b, 1, HALF_Q); - - res = montgomery_reduce((int32_t)a * (int32_t)b); - /* Bounds: - * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2 - * < MLKEM_Q - */ - - debug_assert_abs_bound(&res, 1, MLKEM_Q); - return res; -} - -/************************************************* - * Name: barrett_reduce - * - * Description: Barrett reduction; given a 16-bit integer a, computes - * centered representative congruent to a mod q in - * {-(q-1)/2,...,(q-1)/2} - * - * Arguments: - int16_t a: input integer to be reduced - * - * Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. - **************************************************/ -static INLINE int16_t barrett_reduce(int16_t a) -__contract__( - ensures(return_value > -HALF_Q && return_value < HALF_Q) -) -{ - /* - * To divide by MLKEM_Q using Barrett multiplication, the "magic number" - * multiplier is round_to_nearest(2**26/MLKEM_Q) - */ - const int BPOWER = 26; - const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q; - - /* - * Compute round_to_nearest(a/MLKEM_Q) using the multiplier - * above and shift by BPOWER places. - * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, - * implementation-defined for negative left argument. Here, - * we assume it's sign-preserving "arithmetic" shift right. 
(C99 6.5.7 (5)) - */ - const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER; - - /* - * t is in -10 .. +10, so we need 32-bit math to - * evaluate t * MLKEM_Q and the subsequent subtraction - */ - int16_t res = (int16_t)(a - t * MLKEM_Q); - - debug_assert_abs_bound(&res, 1, HALF_Q); - return res; -} - -#endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/sampling.c similarity index 73% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.c rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/sampling.c index cbbe4407f..98cbdcb74 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/sampling.c @@ -9,7 +9,7 @@ #include "debug.h" #include "fips202.h" #include "fips202x4.h" -#include "rej_uniform.h" +#include "sampling.h" #include "symmetric.h" /* Static namespacing @@ -18,6 +18,8 @@ * within a single compilation unit. */ #define rej_uniform MLKEM_NAMESPACE(rej_uniform) #define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar) +#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) +#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) /* End of static namespacing */ static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target, @@ -233,9 +235,113 @@ void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) xof_release(&state); } +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) +#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) +/* End of static namespacing */ + +/************************************************* + * Name: load32_littleendian + * + * Description: load 4 bytes into a 32-bit integer + * in little-endian order + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns 32-bit unsigned integer loaded from x + **************************************************/ +static uint32_t load32_littleendian(const uint8_t x[4]) +{ + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + r |= (uint32_t)x[3] << 24; + return r; +} + +MLKEM_NATIVE_INTERNAL_API +void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3))) + { + unsigned j; + uint32_t t = load32_littleendian(buf + 4 * i); + uint32_t d = t & 0x55555555; + d += (t >> 1) & 0x55555555; + + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3))) + { + const int16_t a = (d >> (4 * j + 0)) & 0x3; + const int16_t b = (d >> (4 * j + 2)) & 0x3; + r->coeffs[8 * i + j] = a - b; + } + } +} + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 +/************************************************* + * Name: load24_littleendian + * + * Description: load 3 bytes into a 32-bit integer + * in little-endian order. 
+ * This function is only needed for ML-KEM-512 + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) + **************************************************/ +static uint32_t load24_littleendian(const uint8_t x[3]) +{ + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + return r; +} + +MLKEM_NATIVE_INTERNAL_API +void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 4; i++) + __loop__( + invariant(i <= MLKEM_N / 4) + invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4))) + { + unsigned j; + const uint32_t t = load24_littleendian(buf + 3 * i); + uint32_t d = t & 0x00249249; + d += (t >> 1) & 0x00249249; + d += (t >> 2) & 0x00249249; + + for (j = 0; j < 4; j++) + __loop__( + invariant(i <= MLKEM_N / 4 && j <= 4) + invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4))) + { + const int16_t a = (d >> (6 * j + 0)) & 0x7; + const int16_t b = (d >> (6 * j + 3)) & 0x7; + r->coeffs[4 * i + j] = a - b; + } + } +} +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \ + 3 */ + #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ -#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform) -int empty_cu_rej_uniform; +#define empty_cu_sampling MLKEM_NAMESPACE_K(empty_cu_sampling) +int empty_cu_sampling; #endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/sampling.h similarity index 63% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/rej_uniform.h rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/sampling.h index 801287259..cc524e0fc 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/rej_uniform.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/sampling.h @@ -2,8 +2,8 @@ * Copyright (c) 2024 The 
mlkem-native project authors * SPDX-License-Identifier: Apache-2.0 */ -#ifndef REJ_UNIFORM_H -#define REJ_UNIFORM_H +#ifndef SAMPLING_H +#define SAMPLING_H #include #include @@ -11,6 +11,37 @@ #include "common.h" #include "poly.h" +#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2) +/************************************************* + * Name: poly_cbd2 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter eta=2 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]); + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 +#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3) +/************************************************* + * Name: poly_cbd3 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter eta=3. 
+ * This function is only needed for ML-KEM-512 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]); +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 */ + #define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4) /************************************************* * Name: poly_rej_uniform_x4 @@ -60,4 +91,4 @@ __contract__( assigns(memory_slice(entry, sizeof(poly))) ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q))); -#endif /* REJ_UNIFORM_H */ +#endif /* SAMPLING_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/zetas.c index 4ef887c62..987f0dce4 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/zetas.c @@ -10,7 +10,7 @@ #include "common.h" #if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) -#include "ntt.h" +#include "poly.h" /* * Table of zeta values used in the reference NTT and inverse NTT. diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/api.h deleted file mode 100644 index 792ecb8a4..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/api.h +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * Native arithmetic interface - * - * This header is primarily for documentation purposes. - * It should not be included by backend implementations. - * - * To ensure consistency with backends, the header will be - * included automatically after inclusion of the active - * backend, to ensure consistency of function signatures, - * and run sanity checks.
- */ -#ifdef MLKEM_NATIVE_ARITH_NATIVE_API_H -#error \ - "The arithmetic backend API `mlkem/native/api.h` " \ - "should not be directly included. Please include the relevant " \ - "structure headers directly." -#else /* MLKEM_NATIVE_ARITH_NATIVE_API_H */ -#define MLKEM_NATIVE_ARITH_NATIVE_API_H - -#include -#include "poly.h" -#include "polyvec.h" - -/* - * This is the C<->native interface allowing for the drop-in of - * native code for performance critical arithmetic components of ML-KEM. - * - * A _backend_ is a specific implementation of (part of) this interface. - * - * To add a function to a backend, define MLKEM_USE_NATIVE_XXX and - * implement `static inline xxx(...)` in the profile header. - * - * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can - * be set if there are native implementations for all of NTT, invNTT, and - * base multiplication, and allows the native implementation to use a - * custom order of polynomial coefficients in NTT domain -- the use of such - * custom order is not an implementation-detail since the public matrix - * is generated in NTT domain. In this case, a permutation function - * poly_permute_bitrev_to_custom() needs to be provided that permutes - * polynomials in NTT domain from bitreversed to the custom order. - */ - -/* - * Those functions are meant to be trivial wrappers around the chosen native - * implementation. The are static inline to avoid unnecessary calls. - * The macro before each declaration controls whether a native - * implementation is present. - */ - -#if defined(MLKEM_USE_NATIVE_NTT) -/************************************************* - * Name: ntt_native - * - * Description: Computes negacyclic number-theoretic transform (NTT) of - * a polynomial in place. - * - * The input polynomial is assumed to be in normal order. - * The output polynomial is in bitreversed order, or of a - * custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. 
- * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER - * for more information. - * - * Arguments: - poly *p: pointer to in/output polynomial - **************************************************/ -static INLINE void ntt_native(poly *); -#endif /* MLKEM_USE_NATIVE_NTT */ - -#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER) -/* - * This must only be set if NTT, invNTT, basemul, mulcache, and - * to/from byte stream conversions all have native implementations - * that are adapted to the custom order. - */ -#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \ - !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ - !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) || \ - !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) || \ - !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) -#error \ - "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \ -set if there are native implementations for NTT, invNTT, mulcache, basemul, \ -and to/from bytes conversions." -#endif - -/************************************************* - * Name: poly_permute_bitrev_to_custom - * - * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined, - * convert a polynomial in NTT domain from bitreversed - * order to the custom order output by the native NTT. - * - * This must only be defined if there is native code for - * all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache. - * Arguments: - poly *p: pointer to in/output polynomial - * - **************************************************/ -static INLINE void poly_permute_bitrev_to_custom(poly *); -#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ - -#if defined(MLKEM_USE_NATIVE_INTT) -/************************************************* - * Name: intt_native - * - * Description: Computes inverse of negacyclic number-theoretic transform (NTT) - * of a polynomial in place. - * - * The input polynomial is in bitreversed order, or of a - * custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. 
- * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER - * for more information. - * The output polynomial is assumed to be in normal order. - * - * Arguments: - uint16_t *a: pointer to in/output polynomial - **************************************************/ -static INLINE void intt_native(poly *); -#endif /* MLKEM_USE_NATIVE_INTT */ - -#if defined(MLKEM_USE_NATIVE_POLY_REDUCE) -/************************************************* - * Name: poly_reduce_native - * - * Description: Applies modular reduction to all coefficients of a polynomial. - * - * Arguments: - poly *r: pointer to input/output polynomial - **************************************************/ -static INLINE void poly_reduce_native(poly *); -#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ - -#if defined(MLKEM_USE_NATIVE_POLY_TOMONT) -/************************************************* - * Name: poly_tomont_native - * - * Description: Inplace conversion of all coefficients of a polynomial - * from normal domain to Montgomery domain - * - * Arguments: - poly *r: pointer to input/output polynomial - **************************************************/ -static INLINE void poly_tomont_native(poly *); -#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ - -#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) -/************************************************* - * Name: poly_mulcache_compute_native - * - * Description: Compute multiplication cache for a polynomial - * in NTT domain. - * - * The purpose of the multiplication cache is to - * cache repeated computations required during a - * base multiplication of polynomials in NTT domain. - * The structure of the multiplication-cache is - * implementation defined. - * - * Arguments: INPUT: - * - poly: const pointer to input polynomial. - * This must be in NTT domain and inin bitreversed order, or of - * a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. - * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER - * for more information. 
- * OUTPUT - * - cache: pointer to multiplication cache - **************************************************/ -static INLINE void poly_mulcache_compute_native(poly_mulcache *cache, - const poly *poly); -#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ - -#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) -/************************************************* - * Name: poly_mulcache_compute_native - * - * Description: Compute multiplication of polynomials in NTT domain. - * - * Arguments: INPUT: - * - a: First polynomial operand. - * This must be in NTT domain and inin bitreversed order, or of - * a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. - * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER - * for more information. - * - b: Second polynomial operand. - * As for a. - * - b_cache: Multiplication-cache for b. - * OUTPUT - * - r: Result of the base multiplication. This is again - * in NTT domain, and of the same order as a and b. - **************************************************/ -static INLINE void polyvec_basemul_acc_montgomery_cached_native( - poly *r, const polyvec *a, const polyvec *b, - const polyvec_mulcache *b_cache); -#endif - -#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES) -/************************************************* - * Name: poly_tobytes_native - * - * Description: Serialization of a polynomial. - * Signed coefficients are converted to - * unsigned form before serialization. - * - * Arguments: INPUT: - * - a: const pointer to input polynomial, - * with each coefficient in the range -Q+1 .. 
Q-1 - * OUTPUT - * - r: pointer to output byte array - * (of MLKEM_POLYBYTES bytes) - **************************************************/ -static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], - const poly *a); -#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ - -#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) -/************************************************* - * Name: poly_frombytes_native - * - * Description: Serialization of a polynomial. - * Signed coefficients are converted to - * unsigned form before serialization. - * - * Arguments: INPUT: - * - r: pointer to output polynomial in NTT domain - * OUTPUT - * - a: const pointer to input byte aray - * (of MLKEM_POLYBYTES bytes) - **************************************************/ -static INLINE void poly_frombytes_native(poly *a, - const uint8_t r[MLKEM_POLYBYTES]); -#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ - -#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM) -/************************************************* - * Name: rej_uniform_native - * - * Description: Run rejection sampling on uniform random bytes to generate - * uniform random integers mod q - * - * Arguments: - int16_t *r: pointer to output buffer - * - unsigned int len: requested number of 16-bit integers - * (uniform mod q). - * - const uint8_t *buf: pointer to input buffer - * (assumed to be uniform random bytes) - * - unsigned int buflen: length of input buffer in bytes. - * - * Return -1 if the native implementation does not support the input lengths. - * Otherwise, returns non-negative number of sampled 16-bit integers (at most - * len). 
- **************************************************/ -static INLINE int rej_uniform_native(int16_t *r, unsigned int len, - const uint8_t *buf, unsigned int buflen); -#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */ - -#endif /* MLKEM_NATIVE_ARITH_NATIVE_API_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/arith_backend.h index 0543b1bd1..ade31cda1 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/arith_backend.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/arith_backend.h @@ -17,7 +17,7 @@ * Keep this _after_ the inclusion of the backend; otherwise, * the sanity checks won't have an effect. */ #if defined(MLKEM_NATIVE_CHECK_APIS) -#include "api.h" +#include "native/api.h" #endif #endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.c deleted file mode 100644 index 1e6b7c5d1..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED - -#include -#include "cbd.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. 
*/ -#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) -#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) -/* End of static namespacing */ - -/************************************************* - * Name: load32_littleendian - * - * Description: load 4 bytes into a 32-bit integer - * in little-endian order - * - * Arguments: - const uint8_t *x: pointer to input byte array - * - * Returns 32-bit unsigned integer loaded from x - **************************************************/ -static uint32_t load32_littleendian(const uint8_t x[4]) -{ - uint32_t r; - r = (uint32_t)x[0]; - r |= (uint32_t)x[1] << 8; - r |= (uint32_t)x[2] << 16; - r |= (uint32_t)x[3] << 24; - return r; -} - -MLKEM_NATIVE_INTERNAL_API -void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3))) - { - unsigned j; - uint32_t t = load32_littleendian(buf + 4 * i); - uint32_t d = t & 0x55555555; - d += (t >> 1) & 0x55555555; - - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3))) - { - const int16_t a = (d >> (4 * j + 0)) & 0x3; - const int16_t b = (d >> (4 * j + 2)) & 0x3; - r->coeffs[8 * i + j] = a - b; - } - } -} - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 -/************************************************* - * Name: load24_littleendian - * - * Description: load 3 bytes into a 32-bit integer - * in little-endian order. 
- * This function is only needed for ML-KEM-512 - * - * Arguments: - const uint8_t *x: pointer to input byte array - * - * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) - **************************************************/ -static uint32_t load24_littleendian(const uint8_t x[3]) -{ - uint32_t r; - r = (uint32_t)x[0]; - r |= (uint32_t)x[1] << 8; - r |= (uint32_t)x[2] << 16; - return r; -} - -MLKEM_NATIVE_INTERNAL_API -void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 4; i++) - __loop__( - invariant(i <= MLKEM_N / 4) - invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4))) - { - unsigned j; - const uint32_t t = load24_littleendian(buf + 3 * i); - uint32_t d = t & 0x00249249; - d += (t >> 1) & 0x00249249; - d += (t >> 2) & 0x00249249; - - for (j = 0; j < 4; j++) - __loop__( - invariant(i <= MLKEM_N / 4 && j <= 4) - invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4))) - { - const int16_t a = (d >> (6 * j + 0)) & 0x7; - const int16_t b = (d >> (6 * j + 3)) & 0x7; - r->coeffs[4 * i + j] = a - b; - } - } -} -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \ - 3 */ - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd) -int empty_cu_cbd; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.h deleted file mode 100644 index 54c1f5b90..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef CBD_H -#define CBD_H - -#include -#include "common.h" -#include "poly.h" - -#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2) -/************************************************* - * Name: poly_cbd2 - * - * Description: Given an array of uniformly random bytes, 
compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter eta=2 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]); - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 -#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3) -/************************************************* - * Name: poly_cbd3 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter eta=3. - * This function is only needed for ML-KEM-512 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]); -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */ - -#endif /* CBD_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/common.h index 4f326333e..62ed53ab1 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/common.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/common.h @@ -15,12 +15,19 @@ #include "sys.h" /* Include backend metadata */ -#if defined(MLKEM_USE_NATIVE) -#if defined(MLKEM_NATIVE_ARITH_BACKEND) -#include MLKEM_NATIVE_ARITH_BACKEND +#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) +#if defined(MLKEM_NATIVE_ARITH_BACKEND_FILE) +#include MLKEM_NATIVE_ARITH_BACKEND_FILE +#else +#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_ARITH is set, but MLKEM_NATIVE_ARITH_BACKEND_FILE is not. 
+#endif #endif -#if defined(MLKEM_NATIVE_FIPS202_BACKEND) -#include MLKEM_NATIVE_FIPS202_BACKEND + +#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) +#if defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE) +#include MLKEM_NATIVE_FIPS202_BACKEND_FILE +#else +#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, but MLKEM_NATIVE_FIPS202_BACKEND_FILE is not. #endif #endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/compress.c new file mode 100644 index 000000000..a03fe0ac4 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/compress.c @@ -0,0 +1,395 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include "common.h" +#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) + +#include +#include +#include "arith_backend.h" +#include "cbmc.h" +#include "compress.h" +#include "debug.h" +#include "verify.h" + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3) +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + uint8_t t[8] = {0}; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_bound(t, 0, j, 0, 16))) + { + t[j] = scalar_compress_d4(a->coeffs[8 * i + j]); + } + + r[i * 4] = t[0] | (t[1] << 4); + r[i * 4 + 1] = t[2] | (t[3] << 4); + r[i * 4 + 2] = t[4] | (t[5] << 4); + r[i * 4 + 3] = t[6] | (t[7] << 4); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a) +{ + unsigned j; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + for (j = 0; j < MLKEM_N / 4; j++) + __loop__(invariant(j <= MLKEM_N / 4)) + { + unsigned k; + uint16_t t[4]; + for (k = 0; k < 4; k++) + __loop__( + invariant(k <= 
4) + invariant(forall(r, 0, k, t[r] < (1u << 10)))) + { + t[k] = scalar_compress_d10(a->coeffs[4 * j + k]); + } + + /* + * Make all implicit truncation explicit. No data is being + * truncated for the LHS's since each t[i] is 10-bit in size. + */ + r[5 * j + 0] = (t[0] >> 0) & 0xFF; + r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF); + r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF); + r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF); + r[5 * j + 4] = (t[3] >> 2); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 2; i++) + __loop__( + invariant(i <= MLKEM_N / 2) + invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q))) + { + r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF); + r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF); + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d10(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]) +{ + unsigned j; + for (j = 0; j < MLKEM_N / 4; j++) + __loop__( + invariant(j <= MLKEM_N / 4) + invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q))) + { + unsigned k; + uint16_t t[4]; + uint8_t const *base = &a[5 * j]; + + t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); + t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6)); + t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4)); + t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2)); + + for (k = 0; k < 4; k++) + __loop__( + invariant(k <= 4) + invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q))) + { + r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ + || MLKEM_K == 3) */ + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d5(uint8_t 
r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + uint8_t t[8] = {0}; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_bound(t, 0, j, 0, 32))) + { + t[j] = scalar_compress_d5(a->coeffs[8 * i + j]); + } + + /* + * Explicitly truncate to avoid warning about + * implicit truncation in CBMC, and use array indexing into + * r rather than pointer-arithmetic to simplify verification + */ + r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5)); + r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); + r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4)); + r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); + r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3)); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a) +{ + unsigned j; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (j = 0; j < MLKEM_N / 8; j++) + __loop__(invariant(j <= MLKEM_N / 8)) + { + unsigned k; + uint16_t t[8]; + for (k = 0; k < 8; k++) + __loop__( + invariant(k <= 8) + invariant(forall(r, 0, k, t[r] < (1u << 11)))) + { + t[k] = scalar_compress_d11(a->coeffs[8 * j + k]); + } + + /* + * Make all implicit truncation explicit. No data is being + * truncated for the LHS's since each t[i] is 11-bit in size. 
+ */ + r[11 * j + 0] = (t[0] >> 0) & 0xFF; + r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF); + r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF); + r[11 * j + 3] = (t[2] >> 2) & 0xFF; + r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF); + r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF); + r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF); + r[11 * j + 7] = (t[5] >> 1) & 0xFF; + r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF); + r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF); + r[11 * j + 10] = (t[7] >> 3); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) + { + unsigned j; + uint8_t t[8]; + const unsigned offset = i * 5; + /* + * Explicitly truncate to avoid warning about + * implicit truncation in CBMC and unwind loop for ease + * of proof. + */ + + /* + * Decompress 5 8-bit bytes (so 40 bits) into + * 8 5-bit values stored in t[] + */ + t[0] = 0x1F & (a[offset + 0] >> 0); + t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3)); + t[2] = 0x1F & (a[offset + 1] >> 2); + t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1)); + t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4)); + t[5] = 0x1F & (a[offset + 3] >> 1); + t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2)); + t[7] = 0x1F & (a[offset + 4] >> 3); + + /* and copy to the correct slice in r[] */ + for (j = 0; j < 8; j++) + __loop__( + invariant(j <= 8 && i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) + { + r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d11(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]) +{ + unsigned j; + for (j = 0; j < MLKEM_N / 8; j++) + __loop__( + invariant(j <= MLKEM_N 
/ 8) + invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q))) + { + unsigned k; + uint16_t t[8]; + uint8_t const *base = &a[11 * j]; + t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); + t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5)); + t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) | + ((uint16_t)base[4] << 10)); + t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7)); + t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4)); + t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) | + ((uint16_t)base[8] << 9)); + t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6)); + t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3)); + + for (k = 0; k < 8; k++) + __loop__( + invariant(k <= 8) + invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q))) + { + r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */ + +#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 2; i++) + __loop__(invariant(i <= MLKEM_N / 2)) + { + const uint16_t t0 = a->coeffs[2 * i]; + const uint16_t t1 = a->coeffs[2 * i + 1]; + /* + * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of + * significant data, so these can be packed into 24 bits or exactly + * 3 bytes, as follows. + */ + + /* Least significant bits 0 - 7 of t0. */ + r[3 * i + 0] = t0 & 0xFF; + + /* + * Most significant bits 8 - 11 of t0 become the least significant + * nibble of the second byte. The least significant 4 bits + * of t1 become the upper nibble of the second byte. + */ + r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0); + + /* Bits 4 - 11 of t1 become the third byte. 
*/ + r[3 * i + 2] = t1 >> 4; + } +} +#else /* MLKEM_USE_NATIVE_POLY_TOBYTES */ +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +{ + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + poly_tobytes_native(r, a->coeffs); +} +#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + +#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 2; i++) + __loop__( + invariant(i <= MLKEM_N / 2) + invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT))) + { + const uint8_t t0 = a[3 * i + 0]; + const uint8_t t1 = a[3 * i + 1]; + const uint8_t t2 = a[3 * i + 2]; + r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF); + r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4); + } + + /* Note that the coefficients are not canonical */ + debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT); +} +#else /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +{ + poly_frombytes_native(r->coeffs, a); +} +#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ + +MLKEM_NATIVE_INTERNAL_API +void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +{ + unsigned i; +#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8) +#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!" 
+#endif + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) + { + unsigned j; + for (j = 0; j < 8; j++) + __loop__( + invariant(i < MLKEM_N / 8 && j <= 8) + invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) + { + /* Prevent the compiler from recognizing this as a bit selection */ + uint8_t mask = value_barrier_u8(1u << j); + r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask); + } + } + debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + msg[i] = 0; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8)) + { + uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]); + msg[i] |= t << j; + } + } +} + +#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ + +#define empty_cu_compress MLKEM_NAMESPACE_K(empty_cu_compress) +int empty_cu_compress; + +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/compress.h new file mode 100644 index 000000000..409dbe519 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/compress.h @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef COMPRESS_H +#define COMPRESS_H + +#include +#include +#include "cbmc.h" +#include "common.h" +#include "debug.h" +#include "poly.h" +#include "verify.h" + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1) +#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4) +#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5) +#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10) +#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11) +#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4) +#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5) +#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10) +#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11) +/* End of static namespacing */ + +/************************************************************ + * Name: scalar_compress_d1 + * + * Description: Computes round(u * 2 / q) % 2 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 1. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. + */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d1(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 2) + ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) ) +{ + uint32_t d0 = u << 1; + d0 *= 645083; + d0 += 1u << 30; + d0 >>= 31; + return d0; +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_compress_d4 + * + * Description: Computes round(u * 16 / q) % 16 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 4. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. 
+ ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. + */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d4(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 16) + ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16)) +{ + uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */ + return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */ +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d4 + * + * Description: Computes round(u * q / 16) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 4. + * + * Arguments: - u: Unsigned canonical modulus modulo 16 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d4(uint32_t u) +__contract__( + requires(0 <= u && u < 16) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 8) / 16; } + +/************************************************************ + * Name: scalar_compress_d5 + * + * Description: Computes round(u * 32 / q) % 32 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 5. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. 
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d5(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 32) + ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) ) +{ + uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */ + return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */ +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d5 + * + * Description: Computes round(u * q / 32) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 5. + * + * Arguments: - u: Unsigned canonical modulus modulo 32 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d5(uint32_t u) +__contract__( + requires(0 <= u && u < 32) + ensures(return_value <= MLKEM_Q - 1) +) { return ((u * MLKEM_Q) + 16) / 32; } + +/************************************************************ + * Name: scalar_compress_d10 + * + * Description: Computes round(u * 2**10 / q) % 2**10 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 10. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. 
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d10(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < (1u << 10)) + ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10))) +{ + uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^32 / MLKEM_Q) */ + d0 = (d0 + ((uint64_t)1u << 32)) >> 33; + return (d0 & 0x3FF); +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d10 + * + * Description: Computes round(u * q / 1024) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 10. + * + * Arguments: - u: Unsigned canonical modulus modulo 1024 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d10(uint32_t u) +__contract__( + requires(0 <= u && u < 1024) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 512) / 1024; } + +/************************************************************ + * Name: scalar_compress_d11 + * + * Description: Computes round(u * 2**11 / q) % 2**11 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 11. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. 
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d11(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < (1u << 11)) + ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11))) +{ + uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */ + d0 = (d0 + ((uint64_t)1u << 32)) >> 33; + return (d0 & 0x7FF); +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d11 + * + * Description: Computes round(u * q / 2048) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 11. + * + * Arguments: - u: Unsigned canonical modulus modulo 2048 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d11(uint32_t u) +__contract__( + requires(0 <= u && u < 2048) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 1024) / 2048; } + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \ + (MLKEM_K == 2 || MLKEM_K == 3) +#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4) +/************************************************* + * Name: poly_compress_d4 + * + * Description: Compression (4 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a); + +#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10) +/************************************************* + * Name: poly_compress_d10 + * + * Description: Compression (10 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a); + +#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4) +/************************************************* + * Name: poly_decompress_d4 + * + * Description: De-serialization and subsequent decompression (4 bits) of a + * polynomial; approximate inverse of poly_compress_d4 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q).
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]); + +#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10) +/************************************************* + * Name: poly_decompress_d10 + * + * Description: De-serialization and subsequent decompression (10 bits) of a + * polynomial; approximate inverse of poly_compress_d10 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d10(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]); +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ + || MLKEM_K == 3) */ + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 +#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5) +/************************************************* + * Name: poly_compress_d5 + * + * Description: Compression (5 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a); + +#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11) +/************************************************* + * Name: poly_compress_d11 + * + * Description: Compression (11 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a); + +#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5) +/************************************************* + * Name: poly_decompress_d5 + * + * Description: De-serialization and subsequent decompression (5 bits) of a + * polynomial; approximate inverse of poly_compress_d5 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q).
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]); + +#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11) +/************************************************* + * Name: poly_decompress_d11 + * + * Description: De-serialization and subsequent decompression (11 bits) of a + * polynomial; approximate inverse of poly_compress_d11 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d11(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]); +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \ + */ + +#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes) +/************************************************* + * Name: poly_tobytes + * + * Description: Serialization of a polynomial. + * Signed coefficients are converted to + * unsigned form before serialization. 
+ * + * Arguments: INPUT: + * - a: const pointer to input polynomial, + * with each coefficient in the range [0,1,..,Q-1] + * OUTPUT + * - r: pointer to output byte array + * (of MLKEM_POLYBYTES bytes) + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYBYTES)) + requires(memory_no_alias(a, sizeof(poly))) + requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(r)) +); + + +#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes) +/************************************************* + * Name: poly_frombytes + * + * Description: De-serialization of a polynomial. + * + * Arguments: INPUT + * - a: pointer to input byte array + * (of MLKEM_POLYBYTES bytes) + * OUTPUT + * - r: pointer to output polynomial, with + * each coefficient unsigned and in the range + * 0 .. 4095 + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)) +); + + +#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg) +/************************************************* + * Name: poly_frommsg + * + * Description: Convert 32-byte message to polynomial + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *msg: pointer to input message + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +__contract__( + requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(object_whole(r)) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) +); + +#define poly_tomsg 
MLKEM_NAMESPACE(poly_tomsg) +/************************************************* + * Name: poly_tomsg + * + * Description: Convert polynomial to 32-byte message + * + * Arguments: - uint8_t *msg: pointer to output message + * - const poly *r: pointer to input polynomial + * Coefficients must be unsigned canonical + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r) +__contract__( + requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(msg)) +); + +#endif /* COMPRESS_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/config.h index fa89370ce..e975ede95 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/config.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/config.h @@ -122,46 +122,87 @@ /* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ /****************************************************************************** - * Name: MLKEM_USE_NATIVE + * Name: MLKEM_USE_NATIVE_BACKEND_ARITH * - * Description: Determines whether a native backend should - * be used, if available. + * Description: Determines whether a native arithmetic backend should be used. + * + * The arithmetic backend covers performance critical functions + * such as the number-theoretic transform (NTT). + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the arithmetic backend to be used is + * determined by MLKEM_NATIVE_ARITH_BACKEND_FILE: If the latter is + * unset, the default backend for the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. * * This can also be set using CFLAGS.
* *****************************************************************************/ -#if !defined(MLKEM_USE_NATIVE) -/* #define MLKEM_USE_NATIVE */ +#if !defined(MLKEM_USE_NATIVE_BACKEND_ARITH) +/* #define MLKEM_USE_NATIVE_BACKEND_ARITH */ #endif /****************************************************************************** - * Name: MLKEM_NATIVE_ARITH_BACKEND + * Name: MLKEM_NATIVE_ARITH_BACKEND_FILE * * Description: The arithmetic backend to use. * - * This must be the filename of an arithmetic backend. - * See the existing backends for examples. + * If MLKEM_USE_NATIVE_BACKEND_ARITH is unset, this option + * is ignored. + * + * If MLKEM_USE_NATIVE_BACKEND_ARITH is set, this option must + * either be undefined or the filename of an arithmetic backend. + * If unset, the default backend will be used. * * This can be set using CFLAGS. * *****************************************************************************/ -#if defined(MLKEM_USE_NATIVE) && !defined(MLKEM_NATIVE_ARITH_BACKEND) -#define MLKEM_NATIVE_ARITH_BACKEND "default.h" -#endif /* MLKEM_NATIVE_ARITH_BACKEND */ +#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) && \ + !defined(MLKEM_NATIVE_ARITH_BACKEND_FILE) +#define MLKEM_NATIVE_ARITH_BACKEND_FILE "native/default.h" +#endif /****************************************************************************** - * Name: MLKEM_NATIVE_FIPS202_BACKEND + * Name: MLKEM_USE_NATIVE_BACKEND_FIPS202 + * + * Description: Determines whether a native FIPS202 backend should be used. + * + * The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is + * the performance bottleneck of SHA3 and SHAKE. + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the FIPS202 backend to be used is + * determined by MLKEM_NATIVE_FIPS202_BACKEND_FILE: If the latter is + * unset, the default backend for the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. + * + * This can also be set using CFLAGS.
+ * + *****************************************************************************/ +#if !defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) +/* #define MLKEM_USE_NATIVE_BACKEND_FIPS202 */ +#endif + +/****************************************************************************** + * Name: MLKEM_NATIVE_FIPS202_BACKEND_FILE * * Description: The FIPS-202 backend to use. * - * This must be the filename of an FIPS-202 backend. + * If MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, this option must + * either be undefined or the filename of a FIPS202 backend. + * If unset, the default backend will be used. * * This can be set using CFLAGS. * *****************************************************************************/ -#if defined(MLKEM_USE_NATIVE_FIPS202) && !defined(MLKEM_NATIVE_FIPS202_BACKEND) -#define MLKEM_NATIVE_FIPS202_BACKEND "native/default.h" -#endif /* MLKEM_NATIVE_FIPS202_BACKEND */ +#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) && \ + !defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE) +#define MLKEM_NATIVE_FIPS202_BACKEND_FILE "fips202/native/default.h" +#endif /************************* Config internals ********************************/ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/default.h deleted file mode 100644 index d1e41c52e..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/default.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H -#define MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H - -/* - * Default arithmetic backend - */ -#include "sys.h" - -#ifdef SYS_AARCH64 -/* - * For AArch64, we currently we have one clean and one opt profile. - * We default to the opt profile. - * - * In the future, this may branch further depending on the microarchitecture. 
- */ -#include "aarch64/opt.h" -#endif /* SYS_AARCH64 */ - -#ifdef SYS_X86_64_AVX2 -/* - * For now, there's only one x86_64 profile, based on - * the AVX2 code from the Kyber repository. - * https://github.com/pq-crystals/kyber - */ -#include "x86_64/default.h" -#endif /* SYS_X86_64 */ - -#endif /* MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.c index 0cfcc3e9e..318d0fc77 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.c @@ -9,11 +9,10 @@ #include "fips202.h" #include "fips202x4.h" #include "indcpa.h" -#include "ntt.h" #include "poly.h" -#include "polyvec.h" +#include "poly_k.h" #include "randombytes.h" -#include "rej_uniform.h" +#include "sampling.h" #include "symmetric.h" #include "arith_backend.h" @@ -149,14 +148,14 @@ static void unpack_ciphertext(polyvec *b, poly *v, #define poly_permute_bitrev_to_custom \ MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom) -static INLINE void poly_permute_bitrev_to_custom(poly *data) +static INLINE void poly_permute_bitrev_to_custom(int16_t data[MLKEM_N]) __contract__( /* We don't specify that this should be a permutation, but only * that it does not change the bound established at the end of gen_matrix. 
*/ - requires(memory_no_alias(data, sizeof(poly))) - requires(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N)) + requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q)) assigns(memory_slice(data, sizeof(poly))) - ensures(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); } + ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); } #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ /* Not static for benchmarking */ @@ -245,7 +244,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed) { for (j = 0; j < MLKEM_K; j++) { - poly_permute_bitrev_to_custom(&a[i].vec[j]); + poly_permute_bitrev_to_custom(a[i].vec[j].coeffs); } } } diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.h index 2c4fda3c4..b4d5985bf 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.h @@ -8,7 +8,7 @@ #include #include "cbmc.h" #include "common.h" -#include "polyvec.h" +#include "poly_k.h" #define gen_matrix MLKEM_NAMESPACE_K(gen_matrix) /************************************************* diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/native/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/native/api.h new file mode 100644 index 000000000..0704f9dcd --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/native/api.h @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * Native arithmetic interface + * + * This header is primarily for documentation purposes. + * It should not be included by backend implementations. + * + * To ensure consistency with backends, the header will be + * included automatically after inclusion of the active + * backend, to ensure consistency of function signatures, + * and run sanity checks. 
+ */ +#ifdef MLKEM_NATIVE_ARITH_NATIVE_API_H +#error \ + "The arithmetic backend API `mlkem/native/api.h` " \ + "should not be directly included. Please include the relevant " \ + "structure headers directly." +#else /* MLKEM_NATIVE_ARITH_NATIVE_API_H */ +#define MLKEM_NATIVE_ARITH_NATIVE_API_H + +#include +#include "../common.h" + +/* + * This is the C<->native interface allowing for the drop-in of + * native code for performance critical arithmetic components of ML-KEM. + * + * A _backend_ is a specific implementation of (part of) this interface. + * + * To add a function to a backend, define MLKEM_USE_NATIVE_XXX and + * implement `static inline xxx(...)` in the profile header. + * + * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can + * be set if there are native implementations for all of NTT, invNTT, and + * base multiplication, and allows the native implementation to use a + * custom order of polynomial coefficients in NTT domain -- the use of such + * custom order is not an implementation-detail since the public matrix + * is generated in NTT domain. In this case, a permutation function + * poly_permute_bitrev_to_custom() needs to be provided that permutes + * polynomials in NTT domain from bitreversed to the custom order. + */ + +/* + * Those functions are meant to be trivial wrappers around the chosen native + * implementation. They are static inline to avoid unnecessary calls. + * The macro before each declaration controls whether a native + * implementation is present. + */ + +#if defined(MLKEM_USE_NATIVE_NTT) +/************************************************* + * Name: ntt_native + * + * Description: Computes negacyclic number-theoretic transform (NTT) of + * a polynomial in place. + * + * The input polynomial is assumed to be in normal order. + * The output polynomial is in bitreversed order, or of a + * custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER + * for more information. + * + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial + **************************************************/ +static INLINE void ntt_native(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_NTT */ + +#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER) +/* + * This must only be set if NTT, invNTT, basemul, mulcache, and + * to/from byte stream conversions all have native implementations + * that are adapted to the custom order. + */ +#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) || \ + !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) || \ + !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) +#error \ + "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \ +set if there are native implementations for NTT, invNTT, mulcache, basemul, \ +and to/from bytes conversions." +#endif + +/************************************************* + * Name: poly_permute_bitrev_to_custom + * + * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined, + * convert a polynomial in NTT domain from bitreversed + * order to the custom order output by the native NTT. + * + * This must only be defined if there is native code for + * all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache. + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial + * + **************************************************/ +static INLINE void poly_permute_bitrev_to_custom(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ + +#if defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: intt_native + * + * Description: Computes inverse of negacyclic number-theoretic transform (NTT) + * of a polynomial in place. 
+ * + * The input polynomial is in bitreversed order, or of a + * custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. + * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER + * for more information. + * The output polynomial is assumed to be in normal order. + * + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial + **************************************************/ +static INLINE void intt_native(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_INTT */ + +#if defined(MLKEM_USE_NATIVE_POLY_REDUCE) +/************************************************* + * Name: poly_reduce_native + * + * Description: Applies modular reduction to all coefficients of a polynomial. + * + * Arguments: - int16_t p[MLKEM_N]: pointer to input/output polynomial + **************************************************/ +static INLINE void poly_reduce_native(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ + +#if defined(MLKEM_USE_NATIVE_POLY_TOMONT) +/************************************************* + * Name: poly_tomont_native + * + * Description: Inplace conversion of all coefficients of a polynomial + * from normal domain to Montgomery domain + * + * Arguments: - int16_t p[MLKEM_N]: pointer to input/output polynomial + **************************************************/ +static INLINE void poly_tomont_native(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ + +#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) +/************************************************* + * Name: poly_mulcache_compute_native + * + * Description: Compute multiplication cache for a polynomial + * in NTT domain. + * + * The purpose of the multiplication cache is to + * cache repeated computations required during a + * base multiplication of polynomials in NTT domain. + * The structure of the multiplication-cache is + * implementation defined. + * + * Arguments: INPUT: + * - poly: const pointer to input polynomial.
+ * This must be in NTT domain and in bitreversed order, or of + * a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. + * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER + * for more information. + * OUTPUT + * - cache: pointer to multiplication cache + **************************************************/ +static INLINE void poly_mulcache_compute_native(int16_t cache[MLKEM_N / 2], + const int16_t poly[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ + +#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) +/************************************************* + * Name: polyvec_basemul_acc_montgomery_cached_native + * + * Description: Compute multiplication of polynomials in NTT domain. + * + * Arguments: INPUT: + * - a: First polynomial operand. + * This must be in NTT domain and in bitreversed order, or of + * a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. + * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER + * for more information. + * - b: Second polynomial operand. + * As for a. + * - b_cache: Multiplication-cache for b. + * OUTPUT + * - r: Result of the base multiplication. This is again + * in NTT domain, and of the same order as a and b. + **************************************************/ +static INLINE void polyvec_basemul_acc_montgomery_cached_native( + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]); +#endif + +#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES) +/************************************************* + * Name: poly_tobytes_native + * + * Description: Serialization of a polynomial. + * Signed coefficients are converted to + * unsigned form before serialization. + * + * Arguments: INPUT: + * - a: const pointer to input polynomial, + * with each coefficient in the range -Q+1 ..
Q-1 + * OUTPUT + * - r: pointer to output byte array + * (of MLKEM_POLYBYTES bytes) + **************************************************/ +static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], + const int16_t a[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + +#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) +/************************************************* + * Name: poly_frombytes_native + * + * Description: De-serialization of a polynomial. + * Decodes the MLKEM_POLYBYTES-byte encoding in r + * into the coefficient array a. + * + * Arguments: INPUT: + * - r: const pointer to input byte array + * (of MLKEM_POLYBYTES bytes) + * OUTPUT + * - a: pointer to output polynomial in NTT domain + **************************************************/ +static INLINE void poly_frombytes_native(int16_t a[MLKEM_N], + const uint8_t r[MLKEM_POLYBYTES]); +#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ + +#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM) +/************************************************* + * Name: rej_uniform_native + * + * Description: Run rejection sampling on uniform random bytes to generate + * uniform random integers mod q + * + * Arguments: - int16_t *r: pointer to output buffer + * - unsigned int len: requested number of 16-bit integers + * (uniform mod q). + * - const uint8_t *buf: pointer to input buffer + * (assumed to be uniform random bytes) + * - unsigned int buflen: length of input buffer in bytes. + * + * Return -1 if the native implementation does not support the input lengths. + * Otherwise, returns non-negative number of sampled 16-bit integers (at most + * len).
+ **************************************************/ +static INLINE int rej_uniform_native(int16_t *r, unsigned int len, + const uint8_t *buf, unsigned int buflen); +#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */ + +#endif /* MLKEM_NATIVE_ARITH_NATIVE_API_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/native/default.h new file mode 100644 index 000000000..f9fe4310a --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/native/default.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H +#define MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H + +/* + * Default arithmetic backend + */ +#include "../sys.h" + +#ifdef SYS_AARCH64 +/* + * For AArch64, we currently have one clean and one opt profile. + * We default to the opt profile. + * + * In the future, this may branch further depending on the microarchitecture. + */ +#include "aarch64/opt.h" +#endif /* SYS_AARCH64 */ + +#ifdef SYS_X86_64_AVX2 +/* + * For now, there's only one x86_64 profile, based on + * the AVX2 code from the Kyber repository. + * https://github.com/pq-crystals/kyber + */ +#include "x86_64/default.h" +#endif /* SYS_X86_64_AVX2 */ + +#endif /* MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.c deleted file mode 100644 index 3651c8da9..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.c +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) - -#include -#include "arith_backend.h" -#include "debug.h" -#include "ntt.h" -#include "reduce.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g.
with varying security levels) - * within a single compilation unit. */ -#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block) -#define ntt_layer MLKEM_NAMESPACE(ntt_layer) -#define invntt_layer MLKEM_NAMESPACE(invntt_layer) -/* End of static namespacing */ - -#if !defined(MLKEM_USE_NATIVE_NTT) -/* - * Computes a block CT butterflies with a fixed twiddle factor, - * using Montgomery multiplication. - * Parameters: - * - r: Pointer to base of polynomial (_not_ the base of butterfly block) - * - root: Twiddle factor to use for the butterfly. This must be in - * Montgomery form and signed canonical. - * - start: Offset to the beginning of the butterfly block - * - len: Index difference between coefficients subject to a butterfly - * - bound: Ghost variable describing coefficient bound: Prior to `start`, - * coefficients must be bound by `bound + MLKEM_Q`. Post `start`, - * they must be bound by `bound`. - * When this function returns, output coefficients in the index range - * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`. 
- * Example: - * - start=8, len=4 - * This would compute the following four butterflies - * 8 -- 12 - * 9 -- 13 - * 10 -- 14 - * 11 -- 15 - * - start=4, len=2 - * This would compute the following two butterflies - * 4 -- 6 - * 5 -- 7 - */ -static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, - unsigned start, unsigned len, int bound) -__contract__( - requires(start < MLKEM_N) - requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N) - requires(0 <= bound && bound < INT16_MAX - MLKEM_Q) - requires(-HALF_Q < zeta && zeta < HALF_Q) - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(array_abs_bound(r, 0, start, bound + MLKEM_Q)) - requires(array_abs_bound(r, start, MLKEM_N, bound)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q)) - ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound))) -{ - /* `bound` is a ghost variable only needed in the CBMC specification */ - unsigned j; - ((void)bound); - for (j = start; j < start + len; j++) - __loop__( - invariant(start <= j && j <= start + len) - /* - * Coefficients are updated in strided pairs, so the bounds for the - * intermediate states alternate twice between the old and new bound - */ - invariant(array_abs_bound(r, 0, j, bound + MLKEM_Q)) - invariant(array_abs_bound(r, j, start + len, bound)) - invariant(array_abs_bound(r, start + len, j + len, bound + MLKEM_Q)) - invariant(array_abs_bound(r, j + len, MLKEM_N, bound))) - { - int16_t t; - t = fqmul(r[j + len], zeta); - r[j + len] = r[j] - t; - r[j] = r[j] + t; - } -} - -/* - *Compute one layer of forward NTT - * Parameters: - * - r: Pointer to base of polynomial - * - len: Stride of butterflies in this layer. - * - layer: Ghost variable indicating which layer is being applied. - * Must match `len` via `len == MLKEM_N >> layer`. 
- * Note: `len` could be dropped and computed in the function, but - * we are following the structure of the reference NTT from the - * official Kyber implementation here, merely adding `layer` as - * a ghost variable for the specifications. - */ -static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer) -__contract__( - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer)) - requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q))) -{ - unsigned start, k; - /* `layer` is a ghost variable only needed in the CBMC specification */ - ((void)layer); - /* Twiddle factors for layer n start at index 2^(layer-1) */ - k = MLKEM_N / (2 * len); - for (start = 0; start < MLKEM_N; start += 2 * len) - __loop__( - invariant(start < MLKEM_N + 2 * len) - invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N) - invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q)) - invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q))) - { - int16_t zeta = zetas[k++]; - ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q); - } -} - -/* - * Compute full forward NTT - * NOTE: This particular implementation satisfies a much tighter - * bound on the output coefficients (5*q) than the contractual one (8*q), - * but this is not needed in the calling code. Should we change the - * base multiplication strategy to require smaller NTT output bounds, - * the proof may need strengthening. 
- */ - -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *p) -{ - unsigned len, layer; - int16_t *r; - debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); - r = p->coeffs; - - for (len = 128, layer = 1; len >= 2; len >>= 1, layer++) - __loop__( - invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer)) - invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))) - { - ntt_layer(r, len, layer); - } - - /* Check the stronger bound */ - debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); -} -#else /* MLKEM_USE_NATIVE_NTT */ - -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *p) -{ - debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); - ntt_native(p); - debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); -} -#endif /* MLKEM_USE_NATIVE_NTT */ - -#if !defined(MLKEM_USE_NATIVE_INTT) - -/* Compute one layer of inverse NTT */ -static void invntt_layer(int16_t *r, unsigned len, unsigned layer) -__contract__( - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7) - requires(len == (1 << (8 - layer))) - requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) -{ - unsigned start, k; - /* `layer` is a ghost variable used only in the specification */ - ((void)layer); - k = MLKEM_N / len - 1; - for (start = 0; start < MLKEM_N; start += 2 * len) - __loop__( - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) - invariant(start <= MLKEM_N && k <= 127) - /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */ - invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len)) - { - unsigned j; - int16_t zeta = zetas[k--]; - for (j = start; j < start + len; j++) - __loop__( - invariant(start <= j && j <= start + len) - invariant(start <= MLKEM_N && k <= 127) - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) - { - int16_t t = r[j]; - r[j] = barrett_reduce(t + r[j + len]); - r[j + len] = r[j + len] - t; - r[j + len] = fqmul(r[j + len], 
zeta); - } - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *p) -{ - /* - * Scale input polynomial to account for Montgomery factor - * and NTT twist. This also brings coefficients down to - * absolute value < MLKEM_Q. - */ - unsigned j, len, layer; - const int16_t f = 1441; - int16_t *r = p->coeffs; - - for (j = 0; j < MLKEM_N; j++) - __loop__( - invariant(j <= MLKEM_N) - invariant(array_abs_bound(r, 0, j, MLKEM_Q))) - { - r[j] = fqmul(r[j], f); - } - - /* Run the invNTT layers */ - for (len = 2, layer = 7; len <= 128; len <<= 1, layer--) - __loop__( - invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer))) - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) - { - invntt_layer(p->coeffs, len, layer); - } - - debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); -} -#else /* MLKEM_USE_NATIVE_INTT */ - -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *p) -{ - intt_native(p); - debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); -} -#endif /* MLKEM_USE_NATIVE_INTT */ - -MLKEM_NATIVE_INTERNAL_API -void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], - int16_t b_cached) -{ - int32_t t0, t1; - debug_assert_bound(a, 2, 0, UINT12_LIMIT); - - t0 = (int32_t)a[1] * b_cached; - t0 += (int32_t)a[0] * b[0]; - t1 = (int32_t)a[0] * b[1]; - t1 += (int32_t)a[1] * b[0]; - - /* |ti| < 2 * q * 2^15 */ - r[0] = montgomery_reduce(t0); - r[1] = montgomery_reduce(t1); - - debug_assert_abs_bound(r, 2, 2 * MLKEM_Q); -} - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt) -int empty_cu_ntt; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.h deleted file mode 100644 index 4e80d3ab3..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * 
SPDX-License-Identifier: Apache-2.0 - */ -#ifndef NTT_H -#define NTT_H -#include "common.h" - -#include -#include "cbmc.h" -#include "poly.h" -#include "reduce.h" - -#define zetas MLKEM_NAMESPACE(zetas) -extern const int16_t zetas[128]; - -#define poly_ntt MLKEM_NAMESPACE(poly_ntt) -/************************************************* - * Name: poly_ntt - * - * Description: Computes negacyclic number-theoretic transform (NTT) of - * a polynomial in place. - * - * The input is assumed to be in normal order and - * coefficient-wise bound by MLKEM_Q in absolute value. - * - * The output polynomial is in bitreversed order, and - * coefficient-wise bound by NTT_BOUND in absolute value. - * - * (NOTE: Sometimes the input to the NTT is actually smaller, - * which gives better bounds.) - * - * Arguments: - poly *p: pointer to in/output polynomial - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *r) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q)) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND)) -); - -#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont) -/************************************************* - * Name: poly_invntt_tomont - * - * Description: Computes inverse of negacyclic number-theoretic transform (NTT) - * of a polynomial in place; - * inputs assumed to be in bitreversed order, output in normal - * order - * - * The input is assumed to be in bitreversed order, and can - * have arbitrary coefficients in int16_t. - * - * The output polynomial is in normal order, and - * coefficient-wise bound by INVNTT_BOUND in absolute value. 
- * - * Arguments: - uint16_t *a: pointer to in/output polynomial - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *r) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND)) -); - -#define basemul_cached MLKEM_NAMESPACE(basemul_cached) -/************************************************************ - * Name: basemul_cached - * - * Description: Computes a representative modulo q of - * (a0*b0 + a1*b_cached, a0*b1 + a1*b0)/65536 - * - * If b_cached is b1*zeta, this represents the - * product of (a0 + a1*X) and (b0 + b1*X) in - * Fq[X]/(X^2 - zeta). - * - * Arguments: - r: Pointer to output polynomial - * Upon return, coefficients are bound by - * 2*MLKEM_Q in absolute value. - * - a: Pointer to first input polynomial - * Every coefficient must be in [0..4095] - * - b: Pointer to second input polynomial - * Can have arbitrary int16_t coefficients - * - b_cached: Some precomputed value, typically derived from - * b1 and a twiddle factor. Can be an arbitary int16_t. 
- ************************************************************/ -MLKEM_NATIVE_INTERNAL_API -void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], - int16_t b_cached) -__contract__( - requires(memory_no_alias(r, 2 * sizeof(int16_t))) - requires(memory_no_alias(a, 2 * sizeof(int16_t))) - requires(memory_no_alias(b, 2 * sizeof(int16_t))) - requires(array_bound(a, 0, 2, 0, UINT12_LIMIT)) - assigns(memory_slice(r, 2 * sizeof(int16_t))) - ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q)) -); - -#endif /* NTT_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/params.h index 57ea4c8ba..7f6c12625 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/params.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/params.h @@ -18,6 +18,7 @@ #define MLKEM_N 256 #define MLKEM_Q 3329 #define UINT12_LIMIT 4096 +#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */ #define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define MLKEM_SSBYTES 32 /* size in bytes of shared key */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.c index 7483ebf6d..e8a2e2c6e 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.c @@ -8,388 +8,246 @@ #include #include #include "arith_backend.h" -#include "cbd.h" #include "cbmc.h" #include "debug.h" #include "fips202x4.h" -#include "ntt.h" #include "poly.h" -#include "reduce.h" +#include "sampling.h" #include "symmetric.h" #include "verify.h" -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3) -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a) -{ - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - uint8_t t[8] = {0}; - for (j = 0; j < 8; j++) - __loop__( 
- invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_bound(t, 0, j, 0, 16))) - { - t[j] = scalar_compress_d4(a->coeffs[8 * i + j]); - } - - r[i * 4] = t[0] | (t[1] << 4); - r[i * 4 + 1] = t[2] | (t[3] << 4); - r[i * 4 + 2] = t[4] | (t[5] << 4); - r[i * 4 + 3] = t[6] | (t[7] << 4); - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a) -{ - unsigned j; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - for (j = 0; j < MLKEM_N / 4; j++) - __loop__(invariant(j <= MLKEM_N / 4)) - { - unsigned k; - uint16_t t[4]; - for (k = 0; k < 4; k++) - __loop__( - invariant(k <= 4) - invariant(forall(r, 0, k, t[r] < (1u << 10)))) - { - t[k] = scalar_compress_d10(a->coeffs[4 * j + k]); - } - - /* - * Make all implicit truncation explicit. No data is being - * truncated for the LHS's since each t[i] is 10-bit in size. - */ - r[5 * j + 0] = (t[0] >> 0) & 0xFF; - r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF); - r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF); - r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF); - r[5 * j + 4] = (t[3] >> 2); - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 2; i++) - __loop__( - invariant(i <= MLKEM_N / 2) - invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q))) - { - r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF); - r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF); - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); -} - -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d10(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]) +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16) +#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic) +#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce) +#define fqmul MLKEM_NAMESPACE(fqmul) +#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce) +#define basemul_cached MLKEM_NAMESPACE(basemul_cached) +#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q) +#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block) +#define ntt_layer MLKEM_NAMESPACE(ntt_layer) +#define invntt_layer MLKEM_NAMESPACE(invntt_layer) +/* End of static namespacing */ + +/************************************************* + * Name: cast_uint16_to_int16 + * + * Description: Cast uint16 value to int16 + * + * Returns: + * input x in 0 .. 32767: returns value unchanged + * input x in 32768 .. 65535: returns (x - 65536) + **************************************************/ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "conversion" +#endif +ALWAYS_INLINE +static INLINE int16_t cast_uint16_to_int16(uint16_t x) { - unsigned j; - for (j = 0; j < MLKEM_N / 4; j++) - __loop__( - invariant(j <= MLKEM_N / 4) - invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q))) - { - unsigned k; - uint16_t t[4]; - uint8_t const *base = &a[5 * j]; - - t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); - t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6)); - t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4)); - t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2)); - - for (k = 0; k < 4; k++) - __loop__( - invariant(k <= 4) - invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q))) - { - r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]); - } - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + /* + * PORTABILITY: This relies on uint16_t -> int16_t + * being implemented as the inverse of int16_t -> uint16_t, + * which is implementation-defined (C99 6.3.1.3 (3)) + * 
CBMC (correctly) fails to prove this conversion is OK, + * so we have to suppress that check here + */ + return (int16_t)x; } -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ - || MLKEM_K == 3) */ +#ifdef CBMC +#pragma CPROVER check pop +#endif -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a) +/************************************************* + * Name: montgomery_reduce_generic + * + * Description: Generic Montgomery reduction; given a 32-bit integer a, computes + * 16-bit integer congruent to a * R^-1 mod q, where R=2^16 + * + * Arguments: - int32_t a: input integer to be reduced + * + * Returns: integer congruent to a * R^-1 modulo q, with absolute value + * <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2 + * + **************************************************/ +ALWAYS_INLINE +static INLINE int16_t montgomery_reduce_generic(int32_t a) { - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */ + const uint32_t QINV = 62209; /* q^-1 mod 2^16 */ - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - uint8_t t[8] = {0}; - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_bound(t, 0, j, 0, 32))) - { - t[j] = scalar_compress_d5(a->coeffs[8 * i + j]); - } + /* Compute a*q^{-1} mod 2^16 in unsigned representatives */ + const uint16_t a_reduced = a & UINT16_MAX; + const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX; - /* - * Explicitly truncate to avoid warning about - * implicit truncation in CBMC, and use array indexing into - * r rather than pointer-arithmetic to simplify verification - */ - r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5)); - r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); - r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4)); 
- r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); - r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3)); - } -} + /* Lift to signed canonical representative mod 2^16. */ + const int16_t t = cast_uint16_to_int16(a_inverted); -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a) -{ - unsigned j; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + int32_t r = a - ((int32_t)t * MLKEM_Q); + /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */ - for (j = 0; j < MLKEM_N / 8; j++) - __loop__(invariant(j <= MLKEM_N / 8)) - { - unsigned k; - uint16_t t[8]; - for (k = 0; k < 8; k++) - __loop__( - invariant(k <= 8) - invariant(forall(r, 0, k, t[r] < (1u << 11)))) - { - t[k] = scalar_compress_d11(a->coeffs[8 * j + k]); - } + /* + * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, + * implementation-defined for negative left argument. Here, + * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) + */ + r = r >> 16; + /* Bounds: |r >> 16| <= ceil(|r| / 2^16) + * <= ceil(|a| / 2^16 + MLKEM_Q / 2) + * <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 + * + * (Note that |a >> n| = ceil(|a| / 2^16) for negative a) + */ - /* - * Make all implicit truncation explicit. No data is being - * truncated for the LHS's since each t[i] is 11-bit in size. 
- */ - r[11 * j + 0] = (t[0] >> 0) & 0xFF; - r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF); - r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF); - r[11 * j + 3] = (t[2] >> 2) & 0xFF; - r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF); - r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF); - r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF); - r[11 * j + 7] = (t[5] >> 1) & 0xFF; - r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF); - r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF); - r[11 * j + 10] = (t[7] >> 3); - } + return (int16_t)r; } -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]) +/************************************************* + * Name: montgomery_reduce + * + * Description: Montgomery reduction + * + * Arguments: - int32_t a: input integer to be reduced + * Must be smaller than 2 * 2^12 * 2^15 in absolute value. + * + * Returns: integer congruent to a * R^-1 modulo q, + * smaller than 2 * q in absolute value. + **************************************************/ +static INLINE int16_t montgomery_reduce(int32_t a) +__contract__( + requires(a > -(2 * UINT12_LIMIT * 32768)) + requires(a < (2 * UINT12_LIMIT * 32768)) + ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q) +) { - unsigned i; - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) - { - unsigned j; - uint8_t t[8]; - const unsigned offset = i * 5; - /* - * Explicitly truncate to avoid warning about - * implicit truncation in CBMC and unwind loop for ease - * of proof. 
- */ - - /* - * Decompress 5 8-bit bytes (so 40 bits) into - * 8 5-bit values stored in t[] - */ - t[0] = 0x1F & (a[offset + 0] >> 0); - t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3)); - t[2] = 0x1F & (a[offset + 1] >> 2); - t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1)); - t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4)); - t[5] = 0x1F & (a[offset + 3] >> 1); - t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2)); - t[7] = 0x1F & (a[offset + 4] >> 3); - - /* and copy to the correct slice in r[] */ - for (j = 0; j < 8; j++) - __loop__( - invariant(j <= 8 && i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) - { - r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]); - } - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + int16_t res; + debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768); + + res = montgomery_reduce_generic(a); + /* Bounds: + * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2 + * <= UINT12_LIMIT + (MLKEM_Q + 1) / 2 + * < 2 * MLKEM_Q */ + + debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q); + return res; } -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d11(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]) +#if !defined(MLKEM_USE_NATIVE_POLY_TOMONT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: fqmul + * + * Description: Montgomery multiplication modulo q=3329 + * + * Arguments: - int16_t a: first factor + * Can be any int16_t. + * - int16_t b: second factor. + * Must be signed canonical (abs value <(q+1)/2) + * + * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and + * smaller than q in absolute value. 
+ * + **************************************************/ +static INLINE int16_t fqmul(int16_t a, int16_t b) +__contract__( + requires(b > -HALF_Q) + requires(b < HALF_Q) + ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q) +) { - unsigned j; - for (j = 0; j < MLKEM_N / 8; j++) - __loop__( - invariant(j <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q))) - { - unsigned k; - uint16_t t[8]; - uint8_t const *base = &a[11 * j]; - t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); - t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5)); - t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) | - ((uint16_t)base[4] << 10)); - t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7)); - t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4)); - t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) | - ((uint16_t)base[8] << 9)); - t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6)); - t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3)); - - for (k = 0; k < 8; k++) - __loop__( - invariant(k <= 8) - invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q))) - { - r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]); - } - } + int16_t res; + debug_assert_abs_bound(&b, 1, HALF_Q); + + res = montgomery_reduce((int32_t)a * (int32_t)b); + /* Bounds: + * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2 + * < MLKEM_Q + */ - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + debug_assert_abs_bound(&res, 1, MLKEM_Q); + return res; } -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */ - -#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +#endif /* !defined(MLKEM_USE_NATIVE_POLY_TOMONT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_NTT) || \ + 
!defined(MLKEM_USE_NATIVE_INTT) */ + +#if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || !defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: barrett_reduce + * + * Description: Barrett reduction; given a 16-bit integer a, computes + * centered representative congruent to a mod q in + * {-(q-1)/2,...,(q-1)/2} + * + * Arguments: - int16_t a: input integer to be reduced + * + * Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. + **************************************************/ +static INLINE int16_t barrett_reduce(int16_t a) +__contract__( + ensures(return_value > -HALF_Q && return_value < HALF_Q) +) { - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_N / 2; i++) - __loop__(invariant(i <= MLKEM_N / 2)) - { - const uint16_t t0 = a->coeffs[2 * i]; - const uint16_t t1 = a->coeffs[2 * i + 1]; - /* - * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of - * significant data, so these can be packed into 24 bits or exactly - * 3 bytes, as follows. - */ - - /* Least significant bits 0 - 7 of t0. */ - r[3 * i + 0] = t0 & 0xFF; - - /* - * Most significant bits 8 - 11 of t0 become the least significant - * nibble of the second byte. The least significant 4 bits - * of t1 become the upper nibble of the second byte. - */ - r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0); + /* + * To divide by MLKEM_Q using Barrett multiplication, the "magic number" + * multiplier is round_to_nearest(2**26/MLKEM_Q) + */ + const int BPOWER = 26; + const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q; - /* Bits 4 - 11 of t1 become the third byte. 
*/ - r[3 * i + 2] = t1 >> 4; - } -} -#else /* MLKEM_USE_NATIVE_POLY_TOBYTES */ -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) -{ - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - poly_tobytes_native(r, a); -} -#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + /* + * Compute round_to_nearest(a/MLKEM_Q) using the multiplier + * above and shift by BPOWER places. + * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, + * implementation-defined for negative left argument. Here, + * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) + */ + const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER; -#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 2; i++) - __loop__( - invariant(i <= MLKEM_N / 2) - invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT))) - { - const uint8_t t0 = a[3 * i + 0]; - const uint8_t t1 = a[3 * i + 1]; - const uint8_t t2 = a[3 * i + 2]; - r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF); - r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4); - } + /* + * t is in -10 .. 
+10, so we need 32-bit math to + * evaluate t * MLKEM_Q and the subsequent subtraction + */ + int16_t res = (int16_t)(a - t * MLKEM_Q); - /* Note that the coefficients are not canonical */ - debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT); -} -#else /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -{ - poly_frombytes_native(r, a); + debug_assert_abs_bound(&res, 1, HALF_Q); + return res; } -#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ - -MLKEM_NATIVE_INTERNAL_API -void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +#endif /* !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || \ + !defined(MLKEM_USE_NATIVE_INTT) */ + +static void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], + int16_t b_cached) +__contract__( + requires(memory_no_alias(r, 2 * sizeof(int16_t))) + requires(memory_no_alias(a, 2 * sizeof(int16_t))) + requires(memory_no_alias(b, 2 * sizeof(int16_t))) + requires(array_bound(a, 0, 2, 0, UINT12_LIMIT)) + assigns(memory_slice(r, 2 * sizeof(int16_t))) + ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))) { - unsigned i; -#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8) -#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!" 
-#endif + int32_t t0, t1; + debug_assert_bound(a, 2, 0, UINT12_LIMIT); - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) - { - unsigned j; - for (j = 0; j < 8; j++) - __loop__( - invariant(i < MLKEM_N / 8 && j <= 8) - invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) - { - /* Prevent the compiler from recognizing this as a bit selection */ - uint8_t mask = value_barrier_u8(1u << j); - r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask); - } - } - debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); -} + t0 = (int32_t)a[1] * b_cached; + t0 += (int32_t)a[0] * b[0]; + t1 = (int32_t)a[0] * b[1]; + t1 += (int32_t)a[1] * b[0]; -MLKEM_NATIVE_INTERNAL_API -void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a) -{ - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + /* |ti| < 2 * q * 2^15 */ + r[0] = montgomery_reduce(t0); + r[1] = montgomery_reduce(t1); - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - msg[i] = 0; - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8)) - { - uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]); - msg[i] |= t << j; - } - } + debug_assert_abs_bound(r, 2, 2 * MLKEM_Q); } MLKEM_NATIVE_INTERNAL_API @@ -434,12 +292,46 @@ void poly_tomont(poly *r) MLKEM_NATIVE_INTERNAL_API void poly_tomont(poly *r) { - poly_tomont_native(r); + poly_tomont_native(r->coeffs); debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); } #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ #if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) +/************************************************************ + * Name: scalar_signed_to_unsigned_q + * + * Description: converts signed polynomial coefficient + * from signed (-3328 .. 3328) form to + * unsigned form (0 .. 3328). 
+ * + * Note: Cryptographic constant time implementation + * + * Examples: 0 -> 0 + * 1 -> 1 + * 3328 -> 3328 + * -1 -> 3328 + * -2 -> 3327 + * -3328 -> 1 + * + * Arguments: c: signed coefficient to be converted + ************************************************************/ +static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c) +__contract__( + requires(c > -MLKEM_Q && c < MLKEM_Q) + ensures(return_value >= 0 && return_value < MLKEM_Q) + ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q))) +{ + debug_assert_abs_bound(&c, 1, MLKEM_Q); + + /* Add Q if c is negative, but in constant time */ + c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c)); + + /* and therefore cast to uint16_t is safe. */ + debug_assert_bound(&c, 1, 0, MLKEM_Q); + return (uint16_t)c; +} + MLKEM_NATIVE_INTERNAL_API void poly_reduce(poly *r) { @@ -461,7 +353,7 @@ void poly_reduce(poly *r) MLKEM_NATIVE_INTERNAL_API void poly_reduce(poly *r) { - poly_reduce_native(r); + poly_reduce_native(r->coeffs); debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); } #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ @@ -520,13 +412,232 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a) MLKEM_NATIVE_INTERNAL_API void poly_mulcache_compute(poly_mulcache *x, const poly *a) { - poly_mulcache_compute_native(x, a); + poly_mulcache_compute_native(x->coeffs, a->coeffs); /* Omitting bounds assertion since native implementations may * decide not to use a mulcache. Note that the C backend implementation * of poly_basemul_montgomery_cached() does still include the check. */ } #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ +#if !defined(MLKEM_USE_NATIVE_NTT) +/* + * Computes a block CT butterflies with a fixed twiddle factor, + * using Montgomery multiplication. + * Parameters: + * - r: Pointer to base of polynomial (_not_ the base of butterfly block) + * - root: Twiddle factor to use for the butterfly. This must be in + * Montgomery form and signed canonical. 
+ * - start: Offset to the beginning of the butterfly block + * - len: Index difference between coefficients subject to a butterfly + * - bound: Ghost variable describing coefficient bound: Prior to `start`, + * coefficients must be bound by `bound + MLKEM_Q`. Post `start`, + * they must be bound by `bound`. + * When this function returns, output coefficients in the index range + * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`. + * Example: + * - start=8, len=4 + * This would compute the following four butterflies + * 8 -- 12 + * 9 -- 13 + * 10 -- 14 + * 11 -- 15 + * - start=4, len=2 + * This would compute the following two butterflies + * 4 -- 6 + * 5 -- 7 + */ +static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, + unsigned start, unsigned len, int bound) +__contract__( + requires(start < MLKEM_N) + requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N) + requires(0 <= bound && bound < INT16_MAX - MLKEM_Q) + requires(-HALF_Q < zeta && zeta < HALF_Q) + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(array_abs_bound(r, 0, start, bound + MLKEM_Q)) + requires(array_abs_bound(r, start, MLKEM_N, bound)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q)) + ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound))) +{ + /* `bound` is a ghost variable only needed in the CBMC specification */ + unsigned j; + ((void)bound); + for (j = start; j < start + len; j++) + __loop__( + invariant(start <= j && j <= start + len) + /* + * Coefficients are updated in strided pairs, so the bounds for the + * intermediate states alternate twice between the old and new bound + */ + invariant(array_abs_bound(r, 0, j, bound + MLKEM_Q)) + invariant(array_abs_bound(r, j, start + len, bound)) + invariant(array_abs_bound(r, start + len, j + len, bound + MLKEM_Q)) + invariant(array_abs_bound(r, j + len, MLKEM_N, bound))) + { + int16_t t; + t = fqmul(r[j + len], 
zeta); + r[j + len] = r[j] - t; + r[j] = r[j] + t; + } +} + +/* + *Compute one layer of forward NTT + * Parameters: + * - r: Pointer to base of polynomial + * - len: Stride of butterflies in this layer. + * - layer: Ghost variable indicating which layer is being applied. + * Must match `len` via `len == MLKEM_N >> layer`. + * Note: `len` could be dropped and computed in the function, but + * we are following the structure of the reference NTT from the + * official Kyber implementation here, merely adding `layer` as + * a ghost variable for the specifications. + */ +static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer) +__contract__( + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer)) + requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q))) +{ + unsigned start, k; + /* `layer` is a ghost variable only needed in the CBMC specification */ + ((void)layer); + /* Twiddle factors for layer n start at index 2^(layer-1) */ + k = MLKEM_N / (2 * len); + for (start = 0; start < MLKEM_N; start += 2 * len) + __loop__( + invariant(start < MLKEM_N + 2 * len) + invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N) + invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q)) + invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q))) + { + int16_t zeta = zetas[k++]; + ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q); + } +} + +/* + * Compute full forward NTT + * NOTE: This particular implementation satisfies a much tighter + * bound on the output coefficients (5*q) than the contractual one (8*q), + * but this is not needed in the calling code. Should we change the + * base multiplication strategy to require smaller NTT output bounds, + * the proof may need strengthening. 
+ */ + +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *p) +{ + unsigned len, layer; + int16_t *r; + debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); + r = p->coeffs; + + for (len = 128, layer = 1; len >= 2; len >>= 1, layer++) + __loop__( + invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer)) + invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))) + { + ntt_layer(r, len, layer); + } + + /* Check the stronger bound */ + debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); +} +#else /* MLKEM_USE_NATIVE_NTT */ + +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *p) +{ + debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); + ntt_native(p->coeffs); + debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); +} +#endif /* MLKEM_USE_NATIVE_NTT */ + +#if !defined(MLKEM_USE_NATIVE_INTT) + +/* Compute one layer of inverse NTT */ +static void invntt_layer(int16_t *r, unsigned len, unsigned layer) +__contract__( + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7) + requires(len == (1 << (8 - layer))) + requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) +{ + unsigned start, k; + /* `layer` is a ghost variable used only in the specification */ + ((void)layer); + k = MLKEM_N / len - 1; + for (start = 0; start < MLKEM_N; start += 2 * len) + __loop__( + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) + invariant(start <= MLKEM_N && k <= 127) + /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */ + invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len)) + { + unsigned j; + int16_t zeta = zetas[k--]; + for (j = start; j < start + len; j++) + __loop__( + invariant(start <= j && j <= start + len) + invariant(start <= MLKEM_N && k <= 127) + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) + { + int16_t t = r[j]; + r[j] = barrett_reduce(t + r[j + len]); + r[j + len] = r[j + len] - t; + r[j + len] = fqmul(r[j 
+ len], zeta); + } + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *p) +{ + /* + * Scale input polynomial to account for Montgomery factor + * and NTT twist. This also brings coefficients down to + * absolute value < MLKEM_Q. + */ + unsigned j, len, layer; + const int16_t f = 1441; + int16_t *r = p->coeffs; + + for (j = 0; j < MLKEM_N; j++) + __loop__( + invariant(j <= MLKEM_N) + invariant(array_abs_bound(r, 0, j, MLKEM_Q))) + { + r[j] = fqmul(r[j], f); + } + + /* Run the invNTT layers */ + for (len = 2, layer = 7; len <= 128; len <<= 1, layer--) + __loop__( + invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer))) + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) + { + invntt_layer(p->coeffs, len, layer); + } + + debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); +} +#else /* MLKEM_USE_NATIVE_INTT */ + +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *p) +{ + intt_native(p->coeffs); + debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); +} +#endif /* MLKEM_USE_NATIVE_INTT */ + #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ #define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.h index 6a14c785d..cb0d67c1a 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.h @@ -9,7 +9,7 @@ #include #include "cbmc.h" #include "common.h" -#include "reduce.h" +#include "debug.h" #include "verify.h" /* Absolute exclusive upper bound for the output of the inverse NTT */ @@ -18,6 +18,9 @@ /* Absolute exclusive upper bound for the output of the forward NTT */ #define NTT_BOUND (8 * MLKEM_Q) +#define zetas MLKEM_NAMESPACE(zetas) +extern const int16_t zetas[128]; + /* * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] @@ -38,520 +41,6 @@ typedef struct int16_t coeffs[MLKEM_N >> 1]; } poly_mulcache; -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1) -#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4) -#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5) -#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10) -#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11) -#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4) -#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5) -#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10) -#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11) -#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q) -/* End of static namespacing */ - -/************************************************************ - * Name: scalar_compress_d1 - * - * Description: Computes round(u * 2 / q) - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 1. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. 
- */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d1(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 2) - ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) ) -{ - uint32_t d0 = u << 1; - d0 *= 645083; - d0 += 1u << 30; - d0 >>= 31; - return d0; -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_compress_d4 - * - * Description: Computes round(u * 16 / q) % 16 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 4. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d4(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 16) - ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16)) -{ - uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */ - return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */ -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d4 - * - * Description: Computes round(u * q / 16) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 4. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d4(uint32_t u) -__contract__( - requires(0 <= u && u < 16) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 8) / 16; } - -/************************************************************ - * Name: scalar_compress_d5 - * - * Description: Computes round(u * 32 / q) % 32 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 5. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d5(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 32) - ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) ) -{ - uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */ - return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */ -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d5 - * - * Description: Computes round(u * q / 32) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 5. - * - * Arguments: - u: Unsigned canonical modulus modulo 32 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d5(uint32_t u) -__contract__( - requires(0 <= u && u < 32) - ensures(return_value <= MLKEM_Q - 1) -) { return ((u * MLKEM_Q) + 16) / 32; } - -/************************************************************ - * Name: scalar_compress_d10 - * - * Description: Computes round(u * 2**10 / q) % 2**10 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d10(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < (1u << 10)) - ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10))) -{ - uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^32 / MLKEM_Q) */ - d0 = (d0 + ((uint64_t)1u << 32)) >> 33; - return (d0 & 0x3FF); -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d10 - * - * Description: Computes round(u * q / 1024) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d10(uint32_t u) -__contract__( - requires(0 <= u && u < 1024) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 512) / 1024; } - -/************************************************************ - * Name: scalar_compress_d11 - * - * Description: Computes round(u * 2**11 / q) % 2**11 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 11. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d11(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < (1u << 11)) - ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11))) -{ - uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */ - d0 = (d0 + ((uint64_t)1u << 32)) >> 33; - return (d0 & 0x7FF); -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d11 - * - * Description: Computes round(u * q / 1024) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d11(uint32_t u) -__contract__( - requires(0 <= u && u < 2048) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 1024) / 2048; } - -/************************************************************ - * Name: scalar_signed_to_unsigned_q - * - * Description: converts signed polynomial coefficient - * from signed (-3328 .. 3328) form to - * unsigned form (0 .. 3328). - * - * Note: Cryptographic constant time implementation - * - * Examples: 0 -> 0 - * 1 -> 1 - * 3328 -> 3328 - * -1 -> 3328 - * -2 -> 3327 - * -3328 -> 1 - * - * Arguments: c: signed coefficient to be converted - ************************************************************/ -static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c) -__contract__( - requires(c > -MLKEM_Q && c < MLKEM_Q) - ensures(return_value >= 0 && return_value < MLKEM_Q) - ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q))) -{ - debug_assert_abs_bound(&c, 1, MLKEM_Q); - - /* Add Q if c is negative, but in constant time */ - c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c)); - - /* and therefore cast to uint16_t is safe. */ - debug_assert_bound(&c, 1, 0, MLKEM_Q); - return (uint16_t)c; -} - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \ - (MLKEM_K == 2 || MLKEM_K == 3) -#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4) -/************************************************* - * Name: poly_compress_d4 - * - * Description: Compression (4 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a); - -#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10) -/************************************************* - * Name: poly_compress_d10 - * - * Description: Compression (10 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a); - -#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4) -/************************************************* - * Name: poly_decompress_d4 - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]); - -#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10) -/************************************************* - * Name: poly_decompress_d10 - * - * Description: De-serialization and subsequent decompression (10 bits) of a - * polynomial; approximate inverse of poly_compress_d10 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d10(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]); -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ - || MLKEM_K == 3) */ - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 -#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5) -/************************************************* - * Name: poly_compress_d5 - * - * Description: Compression (5 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a); - -#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11) -/************************************************* - * Name: poly_compress_d11 - * - * Description: Compression (11 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a); - -#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5) -/************************************************* - * Name: poly_decompress_d5 - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]); - -#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11) -/************************************************* - * Name: poly_decompress_d11 - * - * Description: De-serialization and subsequent decompression (11 bits) of a - * polynomial; approximate inverse of poly_compress_d11 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d11(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]); -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \ - */ - -#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes) -/************************************************* - * Name: poly_tobytes - * - * Description: Serialization of a polynomial. - * Signed coefficients are converted to - * unsigned form before serialization. 
- * - * Arguments: INPUT: - * - a: const pointer to input polynomial, - * with each coefficient in the range [0,1,..,Q-1] - * OUTPUT - * - r: pointer to output byte array - * (of MLKEM_POLYBYTES bytes) - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYBYTES)) - requires(memory_no_alias(a, sizeof(poly))) - requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(r)) -); - - -#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes) -/************************************************* - * Name: poly_frombytes - * - * Description: De-serialization of a polynomial. - * - * Arguments: INPUT - * - a: pointer to input byte array - * (of MLKEM_POLYBYTES bytes) - * OUTPUT - * - r: pointer to output polynomial, with - * each coefficient unsigned and in the range - * 0 .. 4095 - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)) -); - - -#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg) -/************************************************* - * Name: poly_frommsg - * - * Description: Convert 32-byte message to polynomial - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *msg: pointer to input message - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) -__contract__( - requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(object_whole(r)) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) -); - -#define poly_tomsg 
MLKEM_NAMESPACE(poly_tomsg) -/************************************************* - * Name: poly_tomsg - * - * Description: Convert polynomial to 32-byte message - * - * Arguments: - uint8_t *msg: pointer to output message - * - const poly *r: pointer to input polynomial - * Coefficients must be unsigned canonical - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r) -__contract__( - requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(msg)) -); - #define poly_basemul_montgomery_cached \ MLKEM_NAMESPACE(poly_basemul_montgomery_cached) /************************************************* @@ -715,4 +204,56 @@ __contract__( assigns(object_whole(r)) ); +#define poly_ntt MLKEM_NAMESPACE(poly_ntt) +/************************************************* + * Name: poly_ntt + * + * Description: Computes negacyclic number-theoretic transform (NTT) of + * a polynomial in place. + * + * The input is assumed to be in normal order and + * coefficient-wise bound by MLKEM_Q in absolute value. + * + * The output polynomial is in bitreversed order, and + * coefficient-wise bound by NTT_BOUND in absolute value. + * + * (NOTE: Sometimes the input to the NTT is actually smaller, + * which gives better bounds.) 
+ * + * Arguments: - poly *p: pointer to in/output polynomial + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *r) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q)) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND)) +); + +#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont) +/************************************************* + * Name: poly_invntt_tomont + * + * Description: Computes inverse of negacyclic number-theoretic transform (NTT) + * of a polynomial in place; + * inputs assumed to be in bitreversed order, output in normal + * order + * + * The input is assumed to be in bitreversed order, and can + * have arbitrary coefficients in int16_t. + * + * The output polynomial is in normal order, and + * coefficient-wise bound by INVNTT_BOUND in absolute value. + * + * Arguments: - poly *r: pointer to in/output polynomial + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *r) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND)) +); + #endif /* POLY_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly_k.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly_k.c new file mode 100644 index 000000000..c2d330ea9 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly_k.c @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include "poly_k.h" +#include +#include +#include "arith_backend.h" +#include "compress.h" +#include "sampling.h" +#include "symmetric.h" + +#include "debug.h" + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g.
with varying security levels) + * within a single compilation unit. */ +#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1) +#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2) +/* End of static namespacing */ + +MLKEM_NATIVE_INTERNAL_API +void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU], + const polyvec *a) +{ + unsigned i; + debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_K; i++) + { + poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]); + } +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_decompress_du(polyvec *r, + const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU]) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU); + } + + debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a) +{ + unsigned i; + debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_K; i++) + { + poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]); + } +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES]) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES); + } + + debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_ntt(polyvec *r) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_ntt(&r->vec[i]); + } + + debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_invntt_tomont(polyvec *r) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_invntt_tomont(&r->vec[i]); + } + + debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND); +} + +#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, + 
const polyvec *b, + const polyvec_mulcache *b_cache) +{ + unsigned i; + poly t; + debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); + + poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]); + for (i = 1; i < MLKEM_K; i++) + { + poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i], + &b_cache->vec[i]); + poly_add(r, &t); + } + + /* + * This bound is true for the C implementation, but not needed + * in the higher level bounds reasoning. It is thus omitted + * from the spec to not unnecessarily constrain native + * implementations, but checked here nonetheless. + */ + debug_assert_abs_bound(r, MLKEM_N, MLKEM_K * 2 * MLKEM_Q); +} +#else /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, + const polyvec *b, + const polyvec_mulcache *b_cache) +{ + debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); + /* Omitting bounds assertion for cache since native implementations may + * decide not to use a mulcache. Note that the C backend implementation + * of poly_basemul_montgomery_cached() does still include the check.
*/ + polyvec_basemul_acc_montgomery_cached_native(r->coeffs, (const int16_t *)a, + (const int16_t *)b, + (const int16_t *)b_cache); +} +#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ + +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) +{ + polyvec_mulcache b_cache; + polyvec_mulcache_compute(&b_cache, b); + polyvec_basemul_acc_montgomery_cached(r, a, b, &b_cache); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_mulcache_compute(&x->vec[i], &a->vec[i]); + } +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_reduce(polyvec *r) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_reduce(&r->vec[i]); + } + + debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_add(polyvec *r, const polyvec *b) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_add(&r->vec[i], &b->vec[i]); + } +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_tomont(polyvec *r) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_tomont(&r->vec[i]); + } + + debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q); +} + + +/************************************************* + * Name: poly_cbd_eta1 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter MLKEM_ETA1. 
+ * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +static INLINE void poly_cbd_eta1(poly *r, + const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4]) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4)) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1)) +) +{ +#if MLKEM_ETA1 == 2 + poly_cbd2(r, buf); +#elif MLKEM_ETA1 == 3 + poly_cbd3(r, buf); +#else +#error "Invalid value of MLKEM_ETA1" +#endif +} + +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, + const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0, + uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) +{ + ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1]; + ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1]; + ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1]; + ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1]; + memcpy(extkey0, seed, MLKEM_SYMBYTES); + memcpy(extkey1, seed, MLKEM_SYMBYTES); + memcpy(extkey2, seed, MLKEM_SYMBYTES); + memcpy(extkey3, seed, MLKEM_SYMBYTES); + extkey0[MLKEM_SYMBYTES] = nonce0; + extkey1[MLKEM_SYMBYTES] = nonce1; + extkey2[MLKEM_SYMBYTES] = nonce2; + extkey3[MLKEM_SYMBYTES] = nonce3; + prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3); + poly_cbd_eta1(r0, buf0); + poly_cbd_eta1(r1, buf1); + poly_cbd_eta1(r2, buf2); + poly_cbd_eta1(r3, buf3); + + debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1); +} + +#if MLKEM_K == 2 || MLKEM_K == 4 
+/************************************************* + * Name: poly_cbd_eta2 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter MLKEM_ETA2. + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +static INLINE void poly_cbd_eta2(poly *r, + const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4)) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))) +{ +#if MLKEM_ETA2 == 2 + poly_cbd2(r, buf); +#else +#error "Invalid value of MLKEM_ETA2" +#endif +} + +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES], + uint8_t nonce) +{ + ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]; + ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1]; + + memcpy(extkey, seed, MLKEM_SYMBYTES); + extkey[MLKEM_SYMBYTES] = nonce; + prf_eta2(buf, extkey); + + poly_cbd_eta2(r, buf); + + debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1); +} +#endif /* MLKEM_K == 2 || MLKEM_K == 4 */ + + +#if MLKEM_K == 2 +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3, + const uint8_t seed[MLKEM_SYMBYTES], + uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, + uint8_t nonce3) +{ + ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4]; + ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1]; + memcpy(extkey[0], seed, MLKEM_SYMBYTES); + memcpy(extkey[1], seed, MLKEM_SYMBYTES); + memcpy(extkey[2], seed, MLKEM_SYMBYTES); + memcpy(extkey[3], seed, MLKEM_SYMBYTES); + extkey[0][MLKEM_SYMBYTES] = nonce0; + extkey[1][MLKEM_SYMBYTES] = nonce1; + extkey[2][MLKEM_SYMBYTES] = nonce2; +
extkey[3][MLKEM_SYMBYTES] = nonce3; + + prf_eta1(buf1[0], extkey[0]); + prf_eta1(buf1[1], extkey[1]); + prf_eta2(buf2[0], extkey[2]); + prf_eta2(buf2[1], extkey[3]); + + poly_cbd_eta1(r0, buf1[0]); + poly_cbd_eta1(r1, buf1[1]); + poly_cbd_eta2(r2, buf2[0]); + poly_cbd_eta2(r3, buf2[1]); + + debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1); + debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1); +} +#endif /* MLKEM_K == 2 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly_k.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly_k.h new file mode 100644 index 000000000..0aea95912 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly_k.h @@ -0,0 +1,596 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef POLY_K_H +#define POLY_K_H + +#include <stdint.h> +#include "common.h" +#include "compress.h" +#include "poly.h" + +#define polyvec MLKEM_NAMESPACE_K(polyvec) +typedef struct +{ + poly vec[MLKEM_K]; +} ALIGN polyvec; + +#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache) +typedef struct +{ + poly_mulcache vec[MLKEM_K]; +} polyvec_mulcache; + +#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du) +/************************************************* + * Name: poly_compress_du + * + * Description: Compression (du bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/ +static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], + const poly *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU)) + requires(memory_no_alias(a, sizeof(poly))) + requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU))) +{ +#if MLKEM_DU == 10 + poly_compress_d10(r, a); +#elif MLKEM_DU == 11 + poly_compress_d11(r, a); +#else +#error "Invalid value of MLKEM_DU" +#endif +} + +#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du) +/************************************************* + * Name: poly_decompress_du + * + * Description: De-serialization and subsequent decompression (du bits) of a + * polynomial; approximate inverse of poly_compress_du + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). 
+ * + **************************************************/ +static INLINE void poly_decompress_du( + poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) +{ +#if MLKEM_DU == 10 + poly_decompress_d10(r, a); +#elif MLKEM_DU == 11 + poly_decompress_d11(r, a); +#else +#error "Invalid value of MLKEM_DU" +#endif +} + +#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv) +/************************************************* + * Name: poly_compress_dv + * + * Description: Compression (dv bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. 
+ **************************************************/ +static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], + const poly *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV)) + requires(memory_no_alias(a, sizeof(poly))) + requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(r))) +{ +#if MLKEM_DV == 4 + poly_compress_d4(r, a); +#elif MLKEM_DV == 5 + poly_compress_d5(r, a); +#else +#error "Invalid value of MLKEM_DV" +#endif +} + + +#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv) +/************************************************* + * Name: poly_decompress_dv + * + * Description: De-serialization and subsequent decompression (dv bits) of a + * polynomial; approximate inverse of poly_compress_dv + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +static INLINE void poly_decompress_dv( + poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(object_whole(r)) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) +{ +#if MLKEM_DV == 4 + poly_decompress_d4(r, a); +#elif MLKEM_DV == 5 + poly_decompress_d5(r, a); +#else +#error "Invalid value of MLKEM_DV" +#endif +} + +#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du) +/************************************************* + * Name: polyvec_compress_du + * + * Description: Compress and serialize vector of polynomials + * + * Arguments: - uint8_t *r: pointer to output byte array + * (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU) + * - const polyvec *a: pointer to input vector of polynomials.
+ * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU], + const polyvec *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU)) + requires(memory_no_alias(a, sizeof(polyvec))) + requires(forall(k0, 0, MLKEM_K, + array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) + assigns(object_whole(r)) +); + +#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du) +/************************************************* + * Name: polyvec_decompress_du + * + * Description: De-serialize and decompress vector of polynomials; + * approximate inverse of polyvec_compress_du + * + * Arguments: - polyvec *r: pointer to output vector of polynomials. + * Output will have coefficients normalized to [0,..,q-1]. + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU) + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_decompress_du(polyvec *r, + const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU)) + requires(memory_no_alias(r, sizeof(polyvec))) + assigns(object_whole(r)) + ensures(forall(k0, 0, MLKEM_K, + array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) +); + +#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes) +/************************************************* + * Name: polyvec_tobytes + * + * Description: Serialize vector of polynomials + * + * Arguments: - uint8_t *r: pointer to output byte array + * (needs space for MLKEM_POLYVECBYTES) + * - const polyvec *a: pointer to input vector of polynomials + * Each polynomial must have coefficients in [0,..,q-1]. 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a) +__contract__( + requires(memory_no_alias(a, sizeof(polyvec))) + requires(memory_no_alias(r, MLKEM_POLYVECBYTES)) + requires(forall(k0, 0, MLKEM_K, + array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) + assigns(object_whole(r)) +); + +#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes) +/************************************************* + * Name: polyvec_frombytes + * + * Description: De-serialize vector of polynomials; + * inverse of polyvec_tobytes + * + * Arguments: - polyvec *r: pointer to output vector of polynomials. + * Output will have coefficients normalized in [0..4095]. + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYVECBYTES) + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES]) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + requires(memory_no_alias(a, MLKEM_POLYVECBYTES)) + assigns(object_whole(r)) + ensures(forall(k0, 0, MLKEM_K, + array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) +); + +#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt) +/************************************************* + * Name: polyvec_ntt + * + * Description: Apply forward NTT to all elements of a vector of polynomials. + * + * The input is assumed to be in normal order and + * coefficient-wise bound by MLKEM_Q in absolute value. + * + * The output polynomial is in bitreversed order, and + * coefficient-wise bound by NTT_BOUND in absolute value.
+ * + * Arguments: - polyvec *r: pointer to in/output vector of polynomials + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_ntt(polyvec *r) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + requires(forall(j, 0, MLKEM_K, + array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q))) + assigns(object_whole(r)) + ensures(forall(j, 0, MLKEM_K, + array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND))) +); + +#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont) +/************************************************* + * Name: polyvec_invntt_tomont + * + * Description: Apply inverse NTT to all elements of a vector of polynomials + * and multiply by Montgomery factor 2^16 + * + * The input is assumed to be in bitreversed order, and can + * have arbitrary coefficients in int16_t. + * + * The output polynomial is in normal order, and + * coefficient-wise bound by INVNTT_BOUND in absolute value. + * + * + * Arguments: - polyvec *r: pointer to in/output vector of polynomials + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_invntt_tomont(polyvec *r) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + assigns(object_whole(r)) + ensures(forall(j, 0, MLKEM_K, + array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, INVNTT_BOUND))) +); + +#define polyvec_basemul_acc_montgomery \ + MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery) +/************************************************* + * Name: polyvec_basemul_acc_montgomery + * + * Description: Multiply elements of a and b in NTT domain, accumulate into r, + * and multiply by 2^-16. 
+ * + * Arguments: - poly *r: pointer to output polynomial + * - const polyvec *a: pointer to first input vector of polynomials + * - const polyvec *b: pointer to second input vector of polynomials + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(a, sizeof(polyvec))) + requires(memory_no_alias(b, sizeof(polyvec))) + requires(forall(k1, 0, MLKEM_K, + array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) + assigns(memory_slice(r, sizeof(poly))) +); + + +#define polyvec_basemul_acc_montgomery_cached \ + MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached) +/************************************************* + * Name: polyvec_basemul_acc_montgomery_cached + * + * Description: Scalar product of two vectors of polynomials in NTT domain, + * using mulcache for second operand. + * + * Bounds: + * - Every coefficient of a is assumed to be in [0..4095] + * - No bounds guarantees for the coefficients in the result. + * + * Arguments: - poly *r: pointer to output polynomial + * - const polyvec *a: pointer to first input polynomial vector + * - const polyvec *b: pointer to second input polynomial vector + * - const polyvec_mulcache *b_cache: pointer to mulcache + * for second input polynomial vector. Can be computed + * via polyvec_mulcache_compute(). 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, + const polyvec *b, + const polyvec_mulcache *b_cache) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(a, sizeof(polyvec))) + requires(memory_no_alias(b, sizeof(polyvec))) + requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache))) + requires(forall(k1, 0, MLKEM_K, + array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) + assigns(memory_slice(r, sizeof(poly))) +); + +#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute) +/************************************************************ + * Name: polyvec_mulcache_compute + * + * Description: Computes the mulcache for a vector of polynomials in NTT domain + * + * The mulcache of a degree-2 polynomial b := b0 + b1*X + * in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when + * computing products of b in Fq[X]/(X^2-zeta). + * + * The mulcache of a polynomial in NTT domain -- which is + * a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta), + * for varying zeta, is the 128-tuple of mulcaches of those + * polynomials. + * + * The mulcache of a vector of polynomials is the vector + * of mulcaches of its entries. + * + * Arguments: - x: Pointer to mulcache to be populated + * - a: Pointer to input polynomial vector + ************************************************************/ +/* + * NOTE: The default C implementation of this function populates + * the mulcache with values in (-q,q), but this is not needed for the + * higher level safety proofs, and thus not part of the spec. 
+ */ +MLKEM_NATIVE_INTERNAL_API +void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a) +__contract__( + requires(memory_no_alias(x, sizeof(polyvec_mulcache))) + requires(memory_no_alias(a, sizeof(polyvec))) + assigns(object_whole(x)) +); + +#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce) +/************************************************* + * Name: polyvec_reduce + * + * Description: Applies Barrett reduction to each coefficient + * of each element of a vector of polynomials; + * for details of the Barrett reduction see comments in reduce.c + * + * Arguments: - polyvec *r: pointer to input/output polynomial + **************************************************/ +/* + * NOTE: The semantics of polyvec_reduce() is different in + * the reference implementation, which requires + * signed canonical output data. Unsigned canonical + * outputs are better suited to the only remaining + * use of poly_reduce() in the context of (de)serialization. + */ +MLKEM_NATIVE_INTERNAL_API +void polyvec_reduce(polyvec *r) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + assigns(object_whole(r)) + ensures(forall(k0, 0, MLKEM_K, + array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) +); + +#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add) +/************************************************* + * Name: polyvec_add + * + * Description: Add vectors of polynomials + * + * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be + * added to + * - const polyvec *b: pointer to second input vector of polynomials + * + * The coefficients of r and b must be so that the addition does + * not overflow. Otherwise, the behaviour of this function is undefined. + * + * The coefficients returned in *r are in int16_t which is sufficient + * to prove type-safety of calling units. Therefore, no stronger + * ensures clause is required on this function. 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_add(polyvec *r, const polyvec *b) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + requires(memory_no_alias(b, sizeof(polyvec))) + requires(forall(j0, 0, MLKEM_K, + forall(k0, 0, MLKEM_N, + (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX))) + requires(forall(j1, 0, MLKEM_K, + forall(k1, 0, MLKEM_N, + (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN))) + assigns(object_whole(r)) +); + +#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont) +/************************************************* + * Name: polyvec_tomont + * + * Description: Inplace conversion of all coefficients of a polynomial + * vector from normal domain to Montgomery domain + * + * Bounds: Output < q in absolute value. + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_tomont(polyvec *r) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + assigns(memory_slice(r, sizeof(polyvec))) + assigns(object_whole(r)) + ensures(forall(j, 0, MLKEM_K, + array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q))) +); + +#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x) +/************************************************* + * Name: poly_getnoise_eta1_4x + * + * Description: Batch sample four polynomials deterministically from a seed + * and nonces, with output polynomials close to centered binomial distribution + * with parameter MLKEM_ETA1. 
+ * + * Arguments: - poly *r{0,1,2,3}: pointer to output polynomial + * - const uint8_t *seed: pointer to input seed + * (of length MLKEM_SYMBYTES bytes) + * - uint8_t nonce{0,1,2,3}: one-byte input nonce + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, + const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0, + uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) +/* Depending on MLKEM_K, the pointers passed to this function belong + to the same objects, so we cannot use memory_no_alias for r0-r3. + + NOTE: Somehow it is important to use memory_no_alias() first in the + conjunctions defining each case. +*/ +#if MLKEM_K == 2 +__contract__( + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */ + (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) && + r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2))) + assigns(memory_slice(r0, sizeof(poly))) + assigns(memory_slice(r1, sizeof(poly))) + assigns(memory_slice(r2, sizeof(poly))) + assigns(memory_slice(r3, sizeof(poly))) + ensures( + array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); +); +#elif MLKEM_K == 4 +__contract__( + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + requires( /* Case B: r0, r1, r2, r3 consecutive */ + (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3)) + assigns(memory_slice(r0, sizeof(poly))) + assigns(memory_slice(r1, sizeof(poly))) + assigns(memory_slice(r2, sizeof(poly))) + assigns(memory_slice(r3, sizeof(poly))) + ensures( + array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 
1) + && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); +); +#elif MLKEM_K == 3 +__contract__( + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + requires( /* Case C: r0, r1, r2 consecutive */ + (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) && + r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0))) + assigns(memory_slice(r0, sizeof(poly))) + assigns(memory_slice(r1, sizeof(poly))) + assigns(memory_slice(r2, sizeof(poly))) + assigns(memory_slice(r3, sizeof(poly))) + ensures( + array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); +); +#endif /* MLKEM_K */ + +#if MLKEM_ETA1 == MLKEM_ETA2 +/* + * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024 + * where MLKEM_ETA2 = MLKEM_ETA1 = 2. + * For ml-kem-512, poly_getnoise_eta1122_4x is used instead. + */ +#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x +#endif /* MLKEM_ETA1 == MLKEM_ETA2 */ + +#if MLKEM_K == 2 || MLKEM_K == 4 +#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2) +/************************************************* + * Name: poly_getnoise_eta2 + * + * Description: Sample a polynomial deterministically from a seed and a nonce, + * with output polynomial close to centered binomial distribution + * with parameter MLKEM_ETA2 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *seed: pointer to input seed + * (of length MLKEM_SYMBYTES bytes) + * - uint8_t nonce: one-byte input nonce + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES], + uint8_t nonce) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + assigns(object_whole(r)) + ensures(array_abs_bound(r->coeffs, 0, 
MLKEM_N, MLKEM_ETA2 + 1)) +); +#endif /* MLKEM_K == 2 || MLKEM_K == 4 */ + +#if MLKEM_K == 2 +#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x) +/************************************************* + * Name: poly_getnoise_eta1122_4x + * + * Description: Batch sample four polynomials deterministically from a seed + * and nonces, with output polynomials close to centered binomial + * distribution with parameter MLKEM_ETA1 (r0, r1) and MLKEM_ETA2 (r2, r3) + * + * Arguments: - poly *r{0,1,2,3}: pointer to output polynomial + * - const uint8_t *seed: pointer to input seed + * (of length MLKEM_SYMBYTES bytes) + * - uint8_t nonce{0,1,2,3}: one-byte input nonce + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3, + const uint8_t seed[MLKEM_SYMBYTES], + uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, + uint8_t nonce3) +__contract__( + requires( /* r0, r1 consecutive, r2, r3 consecutive */ + (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) && + r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2))) + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3)) + ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1) + && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)); +); +#endif /* MLKEM_K == 2 */ + +#endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.c deleted file mode 100644 index 50ea1c34a..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.c +++ /dev/null @@ -1,330 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "polyvec.h" -#include -#include -#include
"arith_backend.h" -#include "cbd.h" -#include "ntt.h" -#include "poly.h" -#include "symmetric.h" - -#include "debug.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1) -#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2) -/* End of static namespacing */ - -MLKEM_NATIVE_INTERNAL_API -void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU], - const polyvec *a) -{ - unsigned i; - debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_K; i++) - { - poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]); - } -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_decompress_du(polyvec *r, - const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU]) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU); - } - - debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a) -{ - unsigned i; - debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_K; i++) - { - poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]); - } -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES]) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES); - } - - debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_ntt(polyvec *r) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_ntt(&r->vec[i]); - } - - debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_invntt_tomont(polyvec *r) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_invntt_tomont(&r->vec[i]); - } - - 
debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND); -} - -#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, - const polyvec *b, - const polyvec_mulcache *b_cache) -{ - unsigned i; - poly t; - debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); - - poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]); - for (i = 1; i < MLKEM_K; i++) - { - poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i], - &b_cache->vec[i]); - poly_add(r, &t); - } - - /* - * This bound is true for the C implementation, but not needed - * in the higher level bounds reasoning. It is thus omitted - * them from the spec to not unnecessarily constrain native - * implementations, but checked here nonetheless. - */ - debug_assert_abs_bound(r, MLKEM_K, MLKEM_N * 2 * MLKEM_Q); -} -#else /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, - const polyvec *b, - const polyvec_mulcache *b_cache) -{ - debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); - /* Omitting bounds assertion for cache since native implementations may - * decide not to use a mulcache. Note that the C backend implementation - * of poly_basemul_montgomery_cached() does still include the check. 
*/ - polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache); -} -#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ - -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) -{ - polyvec_mulcache b_cache; - polyvec_mulcache_compute(&b_cache, b); - polyvec_basemul_acc_montgomery_cached(r, a, b, &b_cache); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_mulcache_compute(&x->vec[i], &a->vec[i]); - } -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_reduce(polyvec *r) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_reduce(&r->vec[i]); - } - - debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_add(polyvec *r, const polyvec *b) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_add(&r->vec[i], &b->vec[i]); - } -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_tomont(polyvec *r) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_tomont(&r->vec[i]); - } - - debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q); -} - - -/************************************************* - * Name: poly_cbd_eta1 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter MLKEM_ETA1. 
- * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -static INLINE void poly_cbd_eta1(poly *r, - const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4]) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4)) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1)) -) -{ -#if MLKEM_ETA1 == 2 - poly_cbd2(r, buf); -#elif MLKEM_ETA1 == 3 - poly_cbd3(r, buf); -#else -#error "Invalid value of MLKEM_ETA1" -#endif -} - -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0, - uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) -{ - ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1]; - ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1]; - ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1]; - ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1]; - memcpy(extkey0, seed, MLKEM_SYMBYTES); - memcpy(extkey1, seed, MLKEM_SYMBYTES); - memcpy(extkey2, seed, MLKEM_SYMBYTES); - memcpy(extkey3, seed, MLKEM_SYMBYTES); - extkey0[MLKEM_SYMBYTES] = nonce0; - extkey1[MLKEM_SYMBYTES] = nonce1; - extkey2[MLKEM_SYMBYTES] = nonce2; - extkey3[MLKEM_SYMBYTES] = nonce3; - prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3); - poly_cbd_eta1(r0, buf0); - poly_cbd_eta1(r1, buf1); - poly_cbd_eta1(r2, buf2); - poly_cbd_eta1(r3, buf3); - - debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1); -} - -#if MLKEM_K == 2 || MLKEM_K == 4 
-/************************************************* - * Name: poly_cbd_eta2 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter MLKEM_ETA2. - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -static INLINE void poly_cbd_eta2(poly *r, - const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4)) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))) -{ -#if MLKEM_ETA2 == 2 - poly_cbd2(r, buf); -#else -#error "Invalid value of MLKEM_ETA2" -#endif -} - -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES], - uint8_t nonce) -{ - ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]; - ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1]; - - memcpy(extkey, seed, MLKEM_SYMBYTES); - extkey[MLKEM_SYMBYTES] = nonce; - prf_eta2(buf, extkey); - - poly_cbd_eta2(r, buf); - - debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA1 + 1); -} -#endif /* MLKEM_K == 2 || MLKEM_K == 4 */ - - -#if MLKEM_K == 2 -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t seed[MLKEM_SYMBYTES], - uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, - uint8_t nonce3) -{ - ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4]; - ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1]; - memcpy(extkey[0], seed, MLKEM_SYMBYTES); - memcpy(extkey[1], seed, MLKEM_SYMBYTES); - memcpy(extkey[2], seed, MLKEM_SYMBYTES); - memcpy(extkey[3], seed, MLKEM_SYMBYTES); - extkey[0][MLKEM_SYMBYTES] = nonce0; - extkey[1][MLKEM_SYMBYTES] = nonce1; - extkey[2][MLKEM_SYMBYTES] = nonce2; - 
extkey[3][MLKEM_SYMBYTES] = nonce3; - - prf_eta1(buf1[0], extkey[0]); - prf_eta1(buf1[1], extkey[1]); - prf_eta2(buf2[0], extkey[2]); - prf_eta2(buf2[1], extkey[3]); - - poly_cbd_eta1(r0, buf1[0]); - poly_cbd_eta1(r1, buf1[1]); - poly_cbd_eta2(r2, buf2[0]); - poly_cbd_eta2(r3, buf2[1]); - - debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1); - debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1); -} -#endif /* MLKEM_K == 2 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.h deleted file mode 100644 index 8be8579e0..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.h +++ /dev/null @@ -1,595 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef POLYVEC_H -#define POLYVEC_H - -#include -#include "common.h" -#include "poly.h" - -#define polyvec MLKEM_NAMESPACE_K(polyvec) -typedef struct -{ - poly vec[MLKEM_K]; -} ALIGN polyvec; - -#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache) -typedef struct -{ - poly_mulcache vec[MLKEM_K]; -} polyvec_mulcache; - -#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du) -/************************************************* - * Name: poly_compress_du - * - * Description: Compression (du bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], - const poly *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU)) - requires(memory_no_alias(a, sizeof(poly))) - requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU))) -{ -#if MLKEM_DU == 10 - poly_compress_d10(r, a); -#elif MLKEM_DU == 11 - poly_compress_d11(r, a); -#else -#error "Invalid value of MLKEM_DU" -#endif -} - -#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du) -/************************************************* - * Name: poly_decompress_du - * - * Description: De-serialization and subsequent decompression (du bits) of a - * polynomial; approximate inverse of poly_compress_du - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -static INLINE void poly_decompress_du( - poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) -{ -#if MLKEM_DU == 10 - poly_decompress_d10(r, a); -#elif MLKEM_DU == 11 - poly_decompress_d11(r, a); -#else -#error "Invalid value of MLKEM_DU" -#endif -} - -#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv) -/************************************************* - * Name: poly_compress_dv - * - * Description: Compression (dv bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], - const poly *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV)) - requires(memory_no_alias(a, sizeof(poly))) - requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(r))) -{ -#if MLKEM_DV == 4 - poly_compress_d4(r, a); -#elif MLKEM_DV == 5 - poly_compress_d5(r, a); -#else -#error "Invalid value of MLKEM_DV" -#endif -} - - -#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv) -/************************************************* - * Name: poly_decompress_dv - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -static INLINE void poly_decompress_dv( - poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(object_whole(r)) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) -{ -#if MLKEM_DV == 4 - poly_decompress_d4(r, a); -#elif MLKEM_DV == 5 - poly_decompress_d5(r, a); -#else -#error "Invalid value of MLKEM_DV" -#endif -} - -#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du) -/************************************************* - * Name: polyvec_compress_du - * - * Description: Compress and serialize vector of polynomials - * - * Arguments: - uint8_t *r: pointer to output byte array - * (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU) - * - const polyvec *a: pointer to input vector of polynomials. 
- * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU], - const polyvec *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU)) - requires(memory_no_alias(a, sizeof(polyvec))) - requires(forall(k0, 0, MLKEM_K, - array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) - assigns(object_whole(r)) -); - -#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du) -/************************************************* - * Name: polyvec_decompress_du - * - * Description: De-serialize and decompress vector of polynomials; - * approximate inverse of polyvec_compress_du - * - * Arguments: - polyvec *r: pointer to output vector of polynomials. - * Output will have coefficients normalized to [0,..,q-1]. - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU) - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_decompress_du(polyvec *r, - const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU)) - requires(memory_no_alias(r, sizeof(polyvec))) - assigns(object_whole(r)) - ensures(forall(k0, 0, MLKEM_K, - array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) -); - -#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes) -/************************************************* - * Name: polyvec_tobytes - * - * Description: Serialize vector of polynomials - * - * Arguments: - uint8_t *r: pointer to output byte array - * (needs space for MLKEM_POLYVECBYTES) - * - const polyvec *a: pointer to input vector of polynomials - * Each polynomial must have coefficients in [0,..,q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a) -__contract__( - requires(memory_no_alias(a, sizeof(polyvec))) - requires(memory_no_alias(r, MLKEM_POLYVECBYTES)) - requires(forall(k0, 0, MLKEM_K, - array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) - assigns(object_whole(r)) -); - -#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes) -/************************************************* - * Name: polyvec_frombytes - * - * Description: De-serialize vector of polynomials; - * inverse of polyvec_tobytes - * - * Arguments: - const polyvec *a: pointer to output vector of polynomials - * (of length MLKEM_POLYVECBYTES). Output will have coefficients - * normalized in [0..4095]. - * - uint8_t *r: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES]) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - requires(memory_no_alias(a, MLKEM_POLYVECBYTES)) - assigns(object_whole(r)) - ensures(forall(k0, 0, MLKEM_K, - array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) -); - -#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt) -/************************************************* - * Name: polyvec_ntt - * - * Description: Apply forward NTT to all elements of a vector of polynomials. - * - * The input is assumed to be in normal order and - * coefficient-wise bound by MLKEM_Q in absolute value. - * - * The output polynomial is in bitreversed order, and - * coefficient-wise bound by NTT_BOUND in absolute value. 
- * - * Arguments: - polyvec *r: pointer to in/output vector of polynomials - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_ntt(polyvec *r) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - requires(forall(j, 0, MLKEM_K, - array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q))) - assigns(object_whole(r)) - ensures(forall(j, 0, MLKEM_K, - array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND))) -); - -#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont) -/************************************************* - * Name: polyvec_invntt_tomont - * - * Description: Apply inverse NTT to all elements of a vector of polynomials - * and multiply by Montgomery factor 2^16 - * - * The input is assumed to be in bitreversed order, and can - * have arbitrary coefficients in int16_t. - * - * The output polynomial is in normal order, and - * coefficient-wise bound by INVNTT_BOUND in absolute value. - * - * - * Arguments: - polyvec *r: pointer to in/output vector of polynomials - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_invntt_tomont(polyvec *r) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - assigns(object_whole(r)) - ensures(forall(j, 0, MLKEM_K, - array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, INVNTT_BOUND))) -); - -#define polyvec_basemul_acc_montgomery \ - MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery) -/************************************************* - * Name: polyvec_basemul_acc_montgomery - * - * Description: Multiply elements of a and b in NTT domain, accumulate into r, - * and multiply by 2^-16. 
- * - * Arguments: - poly *r: pointer to output polynomial - * - const polyvec *a: pointer to first input vector of polynomials - * - const polyvec *b: pointer to second input vector of polynomials - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(a, sizeof(polyvec))) - requires(memory_no_alias(b, sizeof(polyvec))) - requires(forall(k1, 0, MLKEM_K, - array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) - assigns(memory_slice(r, sizeof(poly))) -); - - -#define polyvec_basemul_acc_montgomery_cached \ - MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached) -/************************************************* - * Name: polyvec_basemul_acc_montgomery_cached - * - * Description: Scalar product of two vectors of polynomials in NTT domain, - * using mulcache for second operand. - * - * Bounds: - * - Every coefficient of a is assumed to be in [0..4095] - * - No bounds guarantees for the coefficients in the result. - * - * Arguments: - poly *r: pointer to output polynomial - * - const polyvec *a: pointer to first input polynomial vector - * - const polyvec *b: pointer to second input polynomial vector - * - const polyvec_mulcache *b_cache: pointer to mulcache - * for second input polynomial vector. Can be computed - * via polyvec_mulcache_compute(). 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, - const polyvec *b, - const polyvec_mulcache *b_cache) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(a, sizeof(polyvec))) - requires(memory_no_alias(b, sizeof(polyvec))) - requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache))) - requires(forall(k1, 0, MLKEM_K, - array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) - assigns(memory_slice(r, sizeof(poly))) -); - -#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute) -/************************************************************ - * Name: polyvec_mulcache_compute - * - * Description: Computes the mulcache for a vector of polynomials in NTT domain - * - * The mulcache of a degree-2 polynomial b := b0 + b1*X - * in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when - * computing products of b in Fq[X]/(X^2-zeta). - * - * The mulcache of a polynomial in NTT domain -- which is - * a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta), - * for varying zeta, is the 128-tuple of mulcaches of those - * polynomials. - * - * The mulcache of a vector of polynomials is the vector - * of mulcaches of its entries. - * - * Arguments: - x: Pointer to mulcache to be populated - * - a: Pointer to input polynomial vector - ************************************************************/ -/* - * NOTE: The default C implementation of this function populates - * the mulcache with values in (-q,q), but this is not needed for the - * higher level safety proofs, and thus not part of the spec. 
- */ -MLKEM_NATIVE_INTERNAL_API -void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a) -__contract__( - requires(memory_no_alias(x, sizeof(polyvec_mulcache))) - requires(memory_no_alias(a, sizeof(polyvec))) - assigns(object_whole(x)) -); - -#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce) -/************************************************* - * Name: polyvec_reduce - * - * Description: Applies Barrett reduction to each coefficient - * of each element of a vector of polynomials; - * for details of the Barrett reduction see comments in reduce.c - * - * Arguments: - polyvec *r: pointer to input/output polynomial - **************************************************/ -/* - * NOTE: The semantics of polyvec_reduce() is different in - * the reference implementation, which requires - * signed canonical output data. Unsigned canonical - * outputs are better suited to the only remaining - * use of poly_reduce() in the context of (de)serialization. - */ -MLKEM_NATIVE_INTERNAL_API -void polyvec_reduce(polyvec *r) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - assigns(object_whole(r)) - ensures(forall(k0, 0, MLKEM_K, - array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) -); - -#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add) -/************************************************* - * Name: polyvec_add - * - * Description: Add vectors of polynomials - * - * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be - * added to - * - const polyvec *b: pointer to second input vector of polynomials - * - * The coefficients of r and b must be so that the addition does - * not overflow. Otherwise, the behaviour of this function is undefined. - * - * The coefficients returned in *r are in int16_t which is sufficient - * to prove type-safety of calling units. Therefore, no stronger - * ensures clause is required on this function. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_add(polyvec *r, const polyvec *b) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - requires(memory_no_alias(b, sizeof(polyvec))) - requires(forall(j0, 0, MLKEM_K, - forall(k0, 0, MLKEM_N, - (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX))) - requires(forall(j1, 0, MLKEM_K, - forall(k1, 0, MLKEM_N, - (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN))) - assigns(object_whole(r)) -); - -#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont) -/************************************************* - * Name: polyvec_tomont - * - * Description: Inplace conversion of all coefficients of a polynomial - * vector from normal domain to Montgomery domain - * - * Bounds: Output < q in absolute value. - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_tomont(polyvec *r) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - assigns(memory_slice(r, sizeof(polyvec))) - assigns(object_whole(r)) - ensures(forall(j, 0, MLKEM_K, - array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q))) -); - -#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x) -/************************************************* - * Name: poly_getnoise_eta1_4x - * - * Description: Batch sample four polynomials deterministically from a seed - * and nonces, with output polynomials close to centered binomial distribution - * with parameter MLKEM_ETA1. 
- * - * Arguments: - poly *r{0,1,2,3}: pointer to output polynomial - * - const uint8_t *seed: pointer to input seed - * (of length MLKEM_SYMBYTES bytes) - * - uint8_t nonce{0,1,2,3}: one-byte input nonce - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0, - uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) -/* Depending on MLKEM_K, the pointers passed to this function belong - to the same objects, so we cannot use memory_no_alias for r0-r3. - - NOTE: Somehow it is important to use memory_no_alias() first in the - conjunctions defining each case. -*/ -#if MLKEM_K == 2 -__contract__( - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */ - (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) && - r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2))) - assigns(memory_slice(r0, sizeof(poly))) - assigns(memory_slice(r1, sizeof(poly))) - assigns(memory_slice(r2, sizeof(poly))) - assigns(memory_slice(r3, sizeof(poly))) - ensures( - array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); -); -#elif MLKEM_K == 4 -__contract__( - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - requires( /* Case B: r0, r1, r2, r3 consecutive */ - (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3)) - assigns(memory_slice(r0, sizeof(poly))) - assigns(memory_slice(r1, sizeof(poly))) - assigns(memory_slice(r2, sizeof(poly))) - assigns(memory_slice(r3, sizeof(poly))) - ensures( - array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 
1) - && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); -); -#elif MLKEM_K == 3 -__contract__( - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - requires( /* Case C: r0, r1, r2 consecutive */ - (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) && - r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0))) - assigns(memory_slice(r0, sizeof(poly))) - assigns(memory_slice(r1, sizeof(poly))) - assigns(memory_slice(r2, sizeof(poly))) - assigns(memory_slice(r3, sizeof(poly))) - ensures( - array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); -); -#endif /* MLKEM_K */ - -#if MLKEM_ETA1 == MLKEM_ETA2 -/* - * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024 - * where MLKEM_ETA2 = MLKEM_ETA1 = 2. - * For ml-kem-512, poly_getnoise_eta1122_4x is used instead. - */ -#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x -#endif /* MLKEM_ETA1 == MLKEM_ETA2 */ - -#if MLKEM_K == 2 || MLKEM_K == 4 -#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2) -/************************************************* - * Name: poly_getnoise_eta2 - * - * Description: Sample a polynomial deterministically from a seed and a nonce, - * with output polynomial close to centered binomial distribution - * with parameter MLKEM_ETA2 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *seed: pointer to input seed - * (of length MLKEM_SYMBYTES bytes) - * - uint8_t nonce: one-byte input nonce - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES], - uint8_t nonce) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - assigns(object_whole(r)) - ensures(array_abs_bound(r->coeffs, 0, 
MLKEM_N, MLKEM_ETA2 + 1)) -); -#endif /* MLKEM_K == 2 || MLKEM_K == 4 */ - -#if MLKEM_K == 2 -#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x) -/************************************************* - * Name: poly_getnoise_eta1122_4x - * - * Description: Batch sample four polynomials deterministically from a seed - * and a nonces, with output polynomials close to centered binomial - * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2 - * - * Arguments: - poly *r{0,1,2,3}: pointer to output polynomial - * - const uint8_t *seed: pointer to input seed - * (of length MLKEM_SYMBYTES bytes) - * - uint8_t nonce{0,1,2,3}: one-byte input nonce - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t seed[MLKEM_SYMBYTES], - uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, - uint8_t nonce3) -__contract__( - requires( /* r0, r1 consecutive, r2, r3 consecutive */ - (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) && - r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2))) - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3)) - ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1) - && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)); -); -#endif /* MLKEM_K == 2 */ - -#endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/reduce.h deleted file mode 100644 index b432a4201..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/reduce.h +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef REDUCE_H -#define REDUCE_H - -#include -#include "cbmc.h" 
-#include "common.h" -#include "debug.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16) -#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic) -#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce) -#define fqmul MLKEM_NAMESPACE(fqmul) -#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce) -/* End of static namespacing */ - -#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */ - -/************************************************* - * Name: cast_uint16_to_int16 - * - * Description: Cast uint16 value to int16 - * - * Returns: - * input x in 0 .. 32767: returns value unchanged - * input x in 32768 .. 65535: returns (x - 65536) - **************************************************/ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "conversion" -#endif -ALWAYS_INLINE -static INLINE int16_t cast_uint16_to_int16(uint16_t x) -{ - /* - * PORTABILITY: This relies on uint16_t -> int16_t - * being implemented as the inverse of int16_t -> uint16_t, - * which is implementation-defined (C99 6.3.1.3 (3)) - * CBMC (correctly) fails to prove this conversion is OK, - * so we have to suppress that check here - */ - return (int16_t)x; -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************* - * Name: montgomery_reduce_generic - * - * Description: Generic Montgomery reduction; given a 32-bit integer a, computes - * 16-bit integer congruent to a * R^-1 mod q, where R=2^16 - * - * Arguments: - int32_t a: input integer to be reduced - * - * Returns: integer congruent to a * R^-1 modulo q, with absolute value - * <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2 - * - **************************************************/ -ALWAYS_INLINE -static INLINE int16_t montgomery_reduce_generic(int32_t a) -{ - /* QINV == -3327 
converted to uint16_t == -3327 + 65536 == 62209 */ - const uint32_t QINV = 62209; /* q^-1 mod 2^16 */ - - /* Compute a*q^{-1} mod 2^16 in unsigned representatives */ - const uint16_t a_reduced = a & UINT16_MAX; - const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX; - - /* Lift to signed canonical representative mod 2^16. */ - const int16_t t = cast_uint16_to_int16(a_inverted); - - int32_t r = a - ((int32_t)t * MLKEM_Q); - /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */ - - /* - * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, - * implementation-defined for negative left argument. Here, - * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) - */ - r = r >> 16; - /* Bounds: |r >> 16| <= ceil(|r| / 2^16) - * <= ceil(|a| / 2^16 + MLKEM_Q / 2) - * <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 - * - * (Note that |a >> n| = ceil(|a| / 2^16) for negative a) - */ - - return (int16_t)r; -} - -/************************************************* - * Name: montgomery_reduce - * - * Description: Montgomery reduction - * - * Arguments: - int32_t a: input integer to be reduced - * Must be smaller than 2 * 2^12 * 2^15 in absolute value. - * - * Returns: integer congruent to a * R^-1 modulo q, - * smaller than 2 * q in absolute value. 
- **************************************************/ -static INLINE int16_t montgomery_reduce(int32_t a) -__contract__( - requires(a > -(2 * UINT12_LIMIT * 32768)) - requires(a < (2 * UINT12_LIMIT * 32768)) - ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q) -) -{ - int16_t res; - debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768); - - res = montgomery_reduce_generic(a); - /* Bounds: - * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2 - * <= UINT12_LIMIT + (MLKEM_Q + 1) / 2 - * < 2 * MLKEM_Q */ - - debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q); - return res; -} - -/************************************************* - * Name: fqmul - * - * Description: Montgomery multiplication modulo q=3329 - * - * Arguments: - int16_t a: first factor - * Can be any int16_t. - * - int16_t b: second factor. - * Must be signed canonical (abs value <(q+1)/2) - * - * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and - * smaller than q in absolute value. 
- * - **************************************************/ -static INLINE int16_t fqmul(int16_t a, int16_t b) -__contract__( - requires(b > -HALF_Q) - requires(b < HALF_Q) - ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q) -) -{ - int16_t res; - debug_assert_abs_bound(&b, 1, HALF_Q); - - res = montgomery_reduce((int32_t)a * (int32_t)b); - /* Bounds: - * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2 - * < MLKEM_Q - */ - - debug_assert_abs_bound(&res, 1, MLKEM_Q); - return res; -} - -/************************************************* - * Name: barrett_reduce - * - * Description: Barrett reduction; given a 16-bit integer a, computes - * centered representative congruent to a mod q in - * {-(q-1)/2,...,(q-1)/2} - * - * Arguments: - int16_t a: input integer to be reduced - * - * Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. - **************************************************/ -static INLINE int16_t barrett_reduce(int16_t a) -__contract__( - ensures(return_value > -HALF_Q && return_value < HALF_Q) -) -{ - /* - * To divide by MLKEM_Q using Barrett multiplication, the "magic number" - * multiplier is round_to_nearest(2**26/MLKEM_Q) - */ - const int BPOWER = 26; - const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q; - - /* - * Compute round_to_nearest(a/MLKEM_Q) using the multiplier - * above and shift by BPOWER places. - * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, - * implementation-defined for negative left argument. Here, - * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) - */ - const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER; - - /* - * t is in -10 .. 
+10, so we need 32-bit math to - * evaluate t * MLKEM_Q and the subsequent subtraction - */ - int16_t res = (int16_t)(a - t * MLKEM_Q); - - debug_assert_abs_bound(&res, 1, HALF_Q); - return res; -} - -#endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.c deleted file mode 100644 index cbbe4407f..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.c +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) - -#include "arith_backend.h" -#include "debug.h" -#include "fips202.h" -#include "fips202x4.h" -#include "rej_uniform.h" -#include "symmetric.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define rej_uniform MLKEM_NAMESPACE(rej_uniform) -#define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar) -/* End of static namespacing */ - -static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target, - unsigned int offset, const uint8_t *buf, - unsigned int buflen) -__contract__( - requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0) - requires(memory_no_alias(r, sizeof(int16_t) * target)) - requires(memory_no_alias(buf, buflen)) - requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * target)) - ensures(offset <= return_value && return_value <= target) - ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q)) -) -{ - unsigned int ctr, pos; - uint16_t val0, val1; - - debug_assert_bound(r, offset, 0, MLKEM_Q); - - ctr = offset; - pos = 0; - /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */ - while (ctr < target && pos + 3 <= buflen) - __loop__( - invariant(offset 
<= ctr && ctr <= target && pos <= buflen) - invariant(ctr > 0 ==> array_bound(r, 0, ctr, 0, MLKEM_Q))) - { - val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; - val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; - pos += 3; - - if (val0 < MLKEM_Q) - { - r[ctr++] = val0; - } - if (ctr < target && val1 < MLKEM_Q) - { - r[ctr++] = val1; - } - } - - debug_assert_bound(r, ctr, 0, MLKEM_Q); - return ctr; -} - -#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM) -/************************************************* - * Name: rej_uniform - * - * Description: Run rejection sampling on uniform random bytes to generate - * uniform random integers mod q - * - * Arguments: - int16_t *r: pointer to output buffer - * - unsigned int target: requested number of 16-bit integers - * (uniform mod q). - * Must be <= 4096. - * - unsigned int offset: number of 16-bit integers that have - * already been sampled. - * Must be <= target. - * - const uint8_t *buf: pointer to input buffer - * (assumed to be uniform random bytes) - * - unsigned int buflen: length of input buffer in bytes - * Must be <= 4096. - * Must be a multiple of 3. - * - * Note: Strictly speaking, only a few values of buflen near UINT_MAX need - * excluding. The limit of 4096 is somewhat arbitary but sufficient for all - * uses of this function. Similarly, the actual limit for target is UINT_MAX/2. - * - * Returns the new offset of sampled 16-bit integers, at most target, - * and at least the initial offset. - * If the new offset is strictly less than len, all of the input buffers - * is guaranteed to have been consumed. If it is equal to len, no information - * is provided on how many bytes of the input buffer have been consumed. - **************************************************/ - -/* - * NOTE: The signature differs from the Kyber reference implementation - * in that it adds the offset and always expects the base of the target - * buffer. 
This avoids shifting the buffer base in the caller, which appears - * tricky to reason about. - */ -static unsigned int rej_uniform(int16_t *r, unsigned int target, - unsigned int offset, const uint8_t *buf, - unsigned int buflen) -__contract__( - requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0) - requires(memory_no_alias(r, sizeof(int16_t) * target)) - requires(memory_no_alias(buf, buflen)) - requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * target)) - ensures(offset <= return_value && return_value <= target) - ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q)) -) -{ - return rej_uniform_scalar(r, target, offset, buf, buflen); -} -#else /* MLKEM_USE_NATIVE_REJ_UNIFORM */ -static unsigned int rej_uniform(int16_t *r, unsigned int target, - unsigned int offset, const uint8_t *buf, - unsigned int buflen) -{ - int ret; - - /* Sample from large buffer with full lane as much as possible. 
*/ - ret = rej_uniform_native(r + offset, target - offset, buf, buflen); - if (ret != -1) - { - unsigned res = offset + (unsigned)ret; - debug_assert_bound(r, res, 0, MLKEM_Q); - return res; - } - - return rej_uniform_scalar(r, target, offset, buf, buflen); -} -#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */ - -#ifndef MLKEM_GEN_MATRIX_NBLOCKS -#define MLKEM_GEN_MATRIX_NBLOCKS \ - ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE) -#endif - -MLKEM_NATIVE_INTERNAL_API -void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4]) -{ - /* Temporary buffers for XOF output before rejection sampling */ - uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - - /* Tracks the number of coefficients we have already sampled */ - unsigned int ctr[KECCAK_WAY]; - xof_x4_ctx statex; - unsigned int buflen; - - shake128x4_inc_init(&statex); - - /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */ - xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3], - MLKEM_SYMBYTES + 2); - - /* - * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS. - * This should generate the matrix entries with high probability. - */ - xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS, - &statex); - buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE; - ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen); - ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen); - ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen); - ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen); - - /* - * So long as not all matrix entries have been generated, squeeze - * one more block a time until we're done. 
- */ - buflen = XOF_RATE; - while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N || - ctr[3] < MLKEM_N) - __loop__( - assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0), - object_whole(buf1), object_whole(buf2), object_whole(buf3)) - invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N) - invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N) - invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q)) - invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q)) - invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q)) - invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q))) - { - xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex); - ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen); - ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen); - ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen); - ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen); - } - - xof_x4_release(&statex); -} - -MLKEM_NATIVE_INTERNAL_API -void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) -{ - xof_ctx state; - uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - unsigned int ctr, buflen; - - shake128_inc_init(&state); - - xof_absorb(&state, seed, MLKEM_SYMBYTES + 2); - - /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS. - */ - /* This should generate the matrix entry with high probability. 
*/ - xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state); - buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE; - ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen); - - /* Squeeze + sample one more block a time until we're done */ - buflen = XOF_RATE; - while (ctr < MLKEM_N) - __loop__( - assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf)) - invariant(ctr <= MLKEM_N) - invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q))) - { - xof_squeezeblocks(buf, 1, &state); - ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen); - } - - xof_release(&state); -} - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform) -int empty_cu_rej_uniform; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.h deleted file mode 100644 index 801287259..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef REJ_UNIFORM_H -#define REJ_UNIFORM_H - -#include -#include -#include "cbmc.h" -#include "common.h" -#include "poly.h" - -#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4) -/************************************************* - * Name: poly_rej_uniform_x4 - * - * Description: Generate four polynomials using rejection sampling - * on (pseudo-)uniformly random bytes sampled from a seed. - * - * Arguments: - poly *vec: Pointer to an array of 4 polynomials - * to be sampled. - * - uint8_t *seed[4]: Pointer to array of four pointers - * pointing to the seed buffers of size - * MLKEM_SYMBYTES + 2 each. 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4]) -__contract__( - requires(memory_no_alias(vec, sizeof(poly) * 4)) - requires(memory_no_alias(seed, sizeof(uint8_t*) * 4)) - requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2)) - requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2)) - requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2)) - requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2)) - assigns(memory_slice(vec, sizeof(poly) * 4)) - ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q))); - -#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform) -/************************************************* - * Name: poly_rej_uniform - * - * Description: Generate polynomial using rejection sampling - * on (pseudo-)uniformly random bytes sampled from a seed. - * - * Arguments: - poly *vec: Pointer to polynomial to be sampled. - * - uint8_t *seed: Pointer to seed buffer of size - * MLKEM_SYMBYTES + 2 each. 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) -__contract__( - requires(memory_no_alias(entry, sizeof(poly))) - requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2)) - assigns(memory_slice(entry, sizeof(poly))) - ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q))); - -#endif /* REJ_UNIFORM_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/sampling.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/sampling.c new file mode 100644 index 000000000..98cbdcb74 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/sampling.c @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include "common.h" +#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) + +#include "arith_backend.h" +#include "debug.h" +#include "fips202.h" +#include "fips202x4.h" +#include "sampling.h" +#include "symmetric.h" + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define rej_uniform MLKEM_NAMESPACE(rej_uniform) +#define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar) +#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) +#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) +/* End of static namespacing */ + +static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target, + unsigned int offset, const uint8_t *buf, + unsigned int buflen) +__contract__( + requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0) + requires(memory_no_alias(r, sizeof(int16_t) * target)) + requires(memory_no_alias(buf, buflen)) + requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * target)) + ensures(offset <= return_value && return_value <= target) + ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q)) +) +{ + unsigned int ctr, pos; + uint16_t val0, val1; + + debug_assert_bound(r, offset, 0, MLKEM_Q); + + ctr = offset; + pos = 0; + /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */ + while (ctr < target && pos + 3 <= buflen) + __loop__( + invariant(offset <= ctr && ctr <= target && pos <= buflen) + invariant(ctr > 0 ==> array_bound(r, 0, ctr, 0, MLKEM_Q))) + { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; + pos += 3; + + if (val0 < MLKEM_Q) + { + r[ctr++] = val0; + } + if (ctr < target && val1 < MLKEM_Q) + { + r[ctr++] = val1; + } + } + + debug_assert_bound(r, ctr, 0, MLKEM_Q); + return ctr; +} + +#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM) +/************************************************* + * Name: rej_uniform + * + * Description: Run rejection sampling on uniform random bytes to generate + * uniform random integers mod q + * + * Arguments: - int16_t *r: pointer to output buffer + * - unsigned int target: requested number of 16-bit integers + * (uniform mod q). + * Must be <= 4096. 
+ * - unsigned int offset: number of 16-bit integers that have + * already been sampled. + * Must be <= target. + * - const uint8_t *buf: pointer to input buffer + * (assumed to be uniform random bytes) + * - unsigned int buflen: length of input buffer in bytes + * Must be <= 4096. + * Must be a multiple of 3. + * + * Note: Strictly speaking, only a few values of buflen near UINT_MAX need + * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all + * uses of this function. Similarly, the actual limit for target is UINT_MAX/2. + * + * Returns the new offset of sampled 16-bit integers, at most target, + * and at least the initial offset. + * If the new offset is strictly less than target, the whole input buffer + * is guaranteed to have been consumed. If it equals target, no information + * is provided on how many bytes of the input buffer have been consumed. + **************************************************/ + +/* + * NOTE: The signature differs from the Kyber reference implementation + * in that it adds the offset and always expects the base of the target + * buffer. This avoids shifting the buffer base in the caller, which appears + * tricky to reason about. 
+ */ +static unsigned int rej_uniform(int16_t *r, unsigned int target, + unsigned int offset, const uint8_t *buf, + unsigned int buflen) +__contract__( + requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0) + requires(memory_no_alias(r, sizeof(int16_t) * target)) + requires(memory_no_alias(buf, buflen)) + requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * target)) + ensures(offset <= return_value && return_value <= target) + ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q)) +) +{ + return rej_uniform_scalar(r, target, offset, buf, buflen); +} +#else /* MLKEM_USE_NATIVE_REJ_UNIFORM */ +static unsigned int rej_uniform(int16_t *r, unsigned int target, + unsigned int offset, const uint8_t *buf, + unsigned int buflen) +{ + int ret; + + /* Sample from large buffer with full lane as much as possible. */ + ret = rej_uniform_native(r + offset, target - offset, buf, buflen); + if (ret != -1) + { + unsigned res = offset + (unsigned)ret; + debug_assert_bound(r, res, 0, MLKEM_Q); + return res; + } + + return rej_uniform_scalar(r, target, offset, buf, buflen); +} +#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */ + +#ifndef MLKEM_GEN_MATRIX_NBLOCKS +#define MLKEM_GEN_MATRIX_NBLOCKS \ + ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE) +#endif + +MLKEM_NATIVE_INTERNAL_API +void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4]) +{ + /* Temporary buffers for XOF output before rejection sampling */ + uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + + /* Tracks the number of coefficients we have already sampled */ + unsigned int ctr[KECCAK_WAY]; + xof_x4_ctx statex; + unsigned int buflen; + + shake128x4_inc_init(&statex); + + /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */ + 
xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3], + MLKEM_SYMBYTES + 2); + + /* + * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS. + * This should generate the matrix entries with high probability. + */ + xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS, + &statex); + buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE; + ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen); + ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen); + ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen); + ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen); + + /* + * So long as not all matrix entries have been generated, squeeze + * one more block a time until we're done. + */ + buflen = XOF_RATE; + while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N || + ctr[3] < MLKEM_N) + __loop__( + assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0), + object_whole(buf1), object_whole(buf2), object_whole(buf3)) + invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N) + invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N) + invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q)) + invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q)) + invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q)) + invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q))) + { + xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex); + ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen); + ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen); + ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen); + ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen); + } + + xof_x4_release(&statex); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) +{ + xof_ctx state; + uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + unsigned int ctr, 
buflen; + + shake128_inc_init(&state); + + xof_absorb(&state, seed, MLKEM_SYMBYTES + 2); + + /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS. + */ + /* This should generate the matrix entry with high probability. */ + xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state); + buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE; + ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen); + + /* Squeeze + sample one more block a time until we're done */ + buflen = XOF_RATE; + while (ctr < MLKEM_N) + __loop__( + assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf)) + invariant(ctr <= MLKEM_N) + invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q))) + { + xof_squeezeblocks(buf, 1, &state); + ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen); + } + + xof_release(&state); +} + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) +#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) +/* End of static namespacing */ + +/************************************************* + * Name: load32_littleendian + * + * Description: load 4 bytes into a 32-bit integer + * in little-endian order + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns 32-bit unsigned integer loaded from x + **************************************************/ +static uint32_t load32_littleendian(const uint8_t x[4]) +{ + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + r |= (uint32_t)x[3] << 24; + return r; +} + +MLKEM_NATIVE_INTERNAL_API +void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3))) + { + unsigned j; + uint32_t t = load32_littleendian(buf + 4 * i); + uint32_t d = t & 0x55555555; + d += (t >> 1) & 0x55555555; + + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3))) + { + const int16_t a = (d >> (4 * j + 0)) & 0x3; + const int16_t b = (d >> (4 * j + 2)) & 0x3; + r->coeffs[8 * i + j] = a - b; + } + } +} + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 +/************************************************* + * Name: load24_littleendian + * + * Description: load 3 bytes into a 32-bit integer + * in little-endian order. 
+ * This function is only needed for ML-KEM-512 + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) + **************************************************/ +static uint32_t load24_littleendian(const uint8_t x[3]) +{ + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + return r; +} + +MLKEM_NATIVE_INTERNAL_API +void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 4; i++) + __loop__( + invariant(i <= MLKEM_N / 4) + invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4))) + { + unsigned j; + const uint32_t t = load24_littleendian(buf + 3 * i); + uint32_t d = t & 0x00249249; + d += (t >> 1) & 0x00249249; + d += (t >> 2) & 0x00249249; + + for (j = 0; j < 4; j++) + __loop__( + invariant(i <= MLKEM_N / 4 && j <= 4) + invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4))) + { + const int16_t a = (d >> (6 * j + 0)) & 0x7; + const int16_t b = (d >> (6 * j + 3)) & 0x7; + r->coeffs[4 * i + j] = a - b; + } + } +} +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \ + 3 */ + +#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ + +#define empty_cu_sampling MLKEM_NAMESPACE_K(empty_cu_sampling) +int empty_cu_sampling; + +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/sampling.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/sampling.h new file mode 100644 index 000000000..cc524e0fc --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/sampling.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef SAMPLING_H +#define SAMPLING_H + +#include +#include +#include "cbmc.h" +#include "common.h" +#include "poly.h" + +#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2) +/************************************************* + * Name: poly_cbd2 
+ * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter eta=2 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]); + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 +#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3) +/************************************************* + * Name: poly_cbd3 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter eta=3. + * This function is only needed for ML-KEM-512 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]); +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */ + +#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4) +/************************************************* + * Name: poly_rej_uniform_x4 + * + * Description: Generate four polynomials using rejection sampling + * on (pseudo-)uniformly random bytes sampled from a seed. + * + * Arguments: - poly *vec: Pointer to an array of 4 polynomials + * to be sampled. + * - uint8_t *seed[4]: Pointer to array of four pointers + * pointing to the seed buffers of size + * MLKEM_SYMBYTES + 2 each. 
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4]) +__contract__( + requires(memory_no_alias(vec, sizeof(poly) * 4)) + requires(memory_no_alias(seed, sizeof(uint8_t*) * 4)) + requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2)) + requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2)) + requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2)) + requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2)) + assigns(memory_slice(vec, sizeof(poly) * 4)) + ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q))); + +#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform) +/************************************************* + * Name: poly_rej_uniform + * + * Description: Generate polynomial using rejection sampling + * on (pseudo-)uniformly random bytes sampled from a seed. + * + * Arguments: - poly *entry: Pointer to polynomial to be sampled. + * - uint8_t *seed: Pointer to seed buffer of size + * MLKEM_SYMBYTES + 2. 
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) +__contract__( + requires(memory_no_alias(entry, sizeof(poly))) + requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2)) + assigns(memory_slice(entry, sizeof(poly))) + ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q))); + +#endif /* SAMPLING_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/zetas.c index 4ef887c62..987f0dce4 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/zetas.c @@ -10,7 +10,7 @@ #include "common.h" #if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) -#include "ntt.h" +#include "poly.h" /* * Table of zeta values used in the reference NTT and inverse NTT. diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/api.h deleted file mode 100644 index 792ecb8a4..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/api.h +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * Native arithmetic interface - * - * This header is primarily for documentation purposes. - * It should not be included by backend implementations. - * - * To ensure consistency with backends, the header will be - * included automatically after inclusion of the active - * backend, to ensure consistency of function signatures, - * and run sanity checks. - */ -#ifdef MLKEM_NATIVE_ARITH_NATIVE_API_H -#error \ - "The arithmetic backend API `mlkem/native/api.h` " \ - "should not be directly included. Please include the relevant " \ - "structure headers directly." 
-#else /* MLKEM_NATIVE_ARITH_NATIVE_API_H */ -#define MLKEM_NATIVE_ARITH_NATIVE_API_H - -#include -#include "poly.h" -#include "polyvec.h" - -/* - * This is the C<->native interface allowing for the drop-in of - * native code for performance critical arithmetic components of ML-KEM. - * - * A _backend_ is a specific implementation of (part of) this interface. - * - * To add a function to a backend, define MLKEM_USE_NATIVE_XXX and - * implement `static inline xxx(...)` in the profile header. - * - * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can - * be set if there are native implementations for all of NTT, invNTT, and - * base multiplication, and allows the native implementation to use a - * custom order of polynomial coefficients in NTT domain -- the use of such - * custom order is not an implementation-detail since the public matrix - * is generated in NTT domain. In this case, a permutation function - * poly_permute_bitrev_to_custom() needs to be provided that permutes - * polynomials in NTT domain from bitreversed to the custom order. - */ - -/* - * Those functions are meant to be trivial wrappers around the chosen native - * implementation. The are static inline to avoid unnecessary calls. - * The macro before each declaration controls whether a native - * implementation is present. - */ - -#if defined(MLKEM_USE_NATIVE_NTT) -/************************************************* - * Name: ntt_native - * - * Description: Computes negacyclic number-theoretic transform (NTT) of - * a polynomial in place. - * - * The input polynomial is assumed to be in normal order. - * The output polynomial is in bitreversed order, or of a - * custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. - * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER - * for more information. 
- * - * Arguments: - poly *p: pointer to in/output polynomial - **************************************************/ -static INLINE void ntt_native(poly *); -#endif /* MLKEM_USE_NATIVE_NTT */ - -#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER) -/* - * This must only be set if NTT, invNTT, basemul, mulcache, and - * to/from byte stream conversions all have native implementations - * that are adapted to the custom order. - */ -#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \ - !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ - !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) || \ - !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) || \ - !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) -#error \ - "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \ -set if there are native implementations for NTT, invNTT, mulcache, basemul, \ -and to/from bytes conversions." -#endif - -/************************************************* - * Name: poly_permute_bitrev_to_custom - * - * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined, - * convert a polynomial in NTT domain from bitreversed - * order to the custom order output by the native NTT. - * - * This must only be defined if there is native code for - * all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache. - * Arguments: - poly *p: pointer to in/output polynomial - * - **************************************************/ -static INLINE void poly_permute_bitrev_to_custom(poly *); -#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ - -#if defined(MLKEM_USE_NATIVE_INTT) -/************************************************* - * Name: intt_native - * - * Description: Computes inverse of negacyclic number-theoretic transform (NTT) - * of a polynomial in place. - * - * The input polynomial is in bitreversed order, or of a - * custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. - * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER - * for more information. 
- * The output polynomial is assumed to be in normal order. - * - * Arguments: - uint16_t *a: pointer to in/output polynomial - **************************************************/ -static INLINE void intt_native(poly *); -#endif /* MLKEM_USE_NATIVE_INTT */ - -#if defined(MLKEM_USE_NATIVE_POLY_REDUCE) -/************************************************* - * Name: poly_reduce_native - * - * Description: Applies modular reduction to all coefficients of a polynomial. - * - * Arguments: - poly *r: pointer to input/output polynomial - **************************************************/ -static INLINE void poly_reduce_native(poly *); -#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ - -#if defined(MLKEM_USE_NATIVE_POLY_TOMONT) -/************************************************* - * Name: poly_tomont_native - * - * Description: Inplace conversion of all coefficients of a polynomial - * from normal domain to Montgomery domain - * - * Arguments: - poly *r: pointer to input/output polynomial - **************************************************/ -static INLINE void poly_tomont_native(poly *); -#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ - -#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) -/************************************************* - * Name: poly_mulcache_compute_native - * - * Description: Compute multiplication cache for a polynomial - * in NTT domain. - * - * The purpose of the multiplication cache is to - * cache repeated computations required during a - * base multiplication of polynomials in NTT domain. - * The structure of the multiplication-cache is - * implementation defined. - * - * Arguments: INPUT: - * - poly: const pointer to input polynomial. - * This must be in NTT domain and inin bitreversed order, or of - * a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. - * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER - * for more information. 
- * OUTPUT - * - cache: pointer to multiplication cache - **************************************************/ -static INLINE void poly_mulcache_compute_native(poly_mulcache *cache, - const poly *poly); -#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ - -#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) -/************************************************* - * Name: poly_mulcache_compute_native - * - * Description: Compute multiplication of polynomials in NTT domain. - * - * Arguments: INPUT: - * - a: First polynomial operand. - * This must be in NTT domain and inin bitreversed order, or of - * a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. - * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER - * for more information. - * - b: Second polynomial operand. - * As for a. - * - b_cache: Multiplication-cache for b. - * OUTPUT - * - r: Result of the base multiplication. This is again - * in NTT domain, and of the same order as a and b. - **************************************************/ -static INLINE void polyvec_basemul_acc_montgomery_cached_native( - poly *r, const polyvec *a, const polyvec *b, - const polyvec_mulcache *b_cache); -#endif - -#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES) -/************************************************* - * Name: poly_tobytes_native - * - * Description: Serialization of a polynomial. - * Signed coefficients are converted to - * unsigned form before serialization. - * - * Arguments: INPUT: - * - a: const pointer to input polynomial, - * with each coefficient in the range -Q+1 .. 
Q-1 - * OUTPUT - * - r: pointer to output byte array - * (of MLKEM_POLYBYTES bytes) - **************************************************/ -static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], - const poly *a); -#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ - -#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) -/************************************************* - * Name: poly_frombytes_native - * - * Description: Serialization of a polynomial. - * Signed coefficients are converted to - * unsigned form before serialization. - * - * Arguments: INPUT: - * - r: pointer to output polynomial in NTT domain - * OUTPUT - * - a: const pointer to input byte aray - * (of MLKEM_POLYBYTES bytes) - **************************************************/ -static INLINE void poly_frombytes_native(poly *a, - const uint8_t r[MLKEM_POLYBYTES]); -#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ - -#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM) -/************************************************* - * Name: rej_uniform_native - * - * Description: Run rejection sampling on uniform random bytes to generate - * uniform random integers mod q - * - * Arguments: - int16_t *r: pointer to output buffer - * - unsigned int len: requested number of 16-bit integers - * (uniform mod q). - * - const uint8_t *buf: pointer to input buffer - * (assumed to be uniform random bytes) - * - unsigned int buflen: length of input buffer in bytes. - * - * Return -1 if the native implementation does not support the input lengths. - * Otherwise, returns non-negative number of sampled 16-bit integers (at most - * len). 
- **************************************************/ -static INLINE int rej_uniform_native(int16_t *r, unsigned int len, - const uint8_t *buf, unsigned int buflen); -#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */ - -#endif /* MLKEM_NATIVE_ARITH_NATIVE_API_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/arith_backend.h index 0543b1bd1..ade31cda1 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/arith_backend.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/arith_backend.h @@ -17,7 +17,7 @@ * Keep this _after_ the inclusion of the backend; otherwise, * the sanity checks won't have an effect. */ #if defined(MLKEM_NATIVE_CHECK_APIS) -#include "api.h" +#include "native/api.h" #endif #endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbd.c deleted file mode 100644 index 1e6b7c5d1..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbd.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED - -#include -#include "cbd.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. 
*/ -#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) -#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) -/* End of static namespacing */ - -/************************************************* - * Name: load32_littleendian - * - * Description: load 4 bytes into a 32-bit integer - * in little-endian order - * - * Arguments: - const uint8_t *x: pointer to input byte array - * - * Returns 32-bit unsigned integer loaded from x - **************************************************/ -static uint32_t load32_littleendian(const uint8_t x[4]) -{ - uint32_t r; - r = (uint32_t)x[0]; - r |= (uint32_t)x[1] << 8; - r |= (uint32_t)x[2] << 16; - r |= (uint32_t)x[3] << 24; - return r; -} - -MLKEM_NATIVE_INTERNAL_API -void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3))) - { - unsigned j; - uint32_t t = load32_littleendian(buf + 4 * i); - uint32_t d = t & 0x55555555; - d += (t >> 1) & 0x55555555; - - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3))) - { - const int16_t a = (d >> (4 * j + 0)) & 0x3; - const int16_t b = (d >> (4 * j + 2)) & 0x3; - r->coeffs[8 * i + j] = a - b; - } - } -} - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 -/************************************************* - * Name: load24_littleendian - * - * Description: load 3 bytes into a 32-bit integer - * in little-endian order. 
- * This function is only needed for ML-KEM-512 - * - * Arguments: - const uint8_t *x: pointer to input byte array - * - * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) - **************************************************/ -static uint32_t load24_littleendian(const uint8_t x[3]) -{ - uint32_t r; - r = (uint32_t)x[0]; - r |= (uint32_t)x[1] << 8; - r |= (uint32_t)x[2] << 16; - return r; -} - -MLKEM_NATIVE_INTERNAL_API -void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 4; i++) - __loop__( - invariant(i <= MLKEM_N / 4) - invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4))) - { - unsigned j; - const uint32_t t = load24_littleendian(buf + 3 * i); - uint32_t d = t & 0x00249249; - d += (t >> 1) & 0x00249249; - d += (t >> 2) & 0x00249249; - - for (j = 0; j < 4; j++) - __loop__( - invariant(i <= MLKEM_N / 4 && j <= 4) - invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4))) - { - const int16_t a = (d >> (6 * j + 0)) & 0x7; - const int16_t b = (d >> (6 * j + 3)) & 0x7; - r->coeffs[4 * i + j] = a - b; - } - } -} -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \ - 3 */ - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd) -int empty_cu_cbd; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbd.h deleted file mode 100644 index 54c1f5b90..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbd.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef CBD_H -#define CBD_H - -#include -#include "common.h" -#include "poly.h" - -#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2) -/************************************************* - * Name: poly_cbd2 - * - * Description: Given an array of uniformly random 
bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter eta=2 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]); - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 -#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3) -/************************************************* - * Name: poly_cbd3 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter eta=3. - * This function is only needed for ML-KEM-512 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]); -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */ - -#endif /* CBD_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/common.h index 4f326333e..62ed53ab1 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/common.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/common.h @@ -15,12 +15,19 @@ #include "sys.h" /* Include backend metadata */ -#if defined(MLKEM_USE_NATIVE) -#if defined(MLKEM_NATIVE_ARITH_BACKEND) -#include MLKEM_NATIVE_ARITH_BACKEND +#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) +#if defined(MLKEM_NATIVE_ARITH_BACKEND_FILE) +#include MLKEM_NATIVE_ARITH_BACKEND_FILE +#else +#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_ARITH is set, but MLKEM_NATIVE_ARITH_BACKEND_FILE is not. 
+#endif #endif -#if defined(MLKEM_NATIVE_FIPS202_BACKEND) -#include MLKEM_NATIVE_FIPS202_BACKEND + +#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) +#if defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE) +#include MLKEM_NATIVE_FIPS202_BACKEND_FILE +#else +#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, but MLKEM_NATIVE_FIPS202_BACKEND_FILE is not. #endif #endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/compress.c new file mode 100644 index 000000000..a03fe0ac4 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/compress.c @@ -0,0 +1,395 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include "common.h" +#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) + +#include +#include +#include "arith_backend.h" +#include "cbmc.h" +#include "compress.h" +#include "debug.h" +#include "verify.h" + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3) +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + uint8_t t[8] = {0}; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_bound(t, 0, j, 0, 16))) + { + t[j] = scalar_compress_d4(a->coeffs[8 * i + j]); + } + + r[i * 4] = t[0] | (t[1] << 4); + r[i * 4 + 1] = t[2] | (t[3] << 4); + r[i * 4 + 2] = t[4] | (t[5] << 4); + r[i * 4 + 3] = t[6] | (t[7] << 4); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a) +{ + unsigned j; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + for (j = 0; j < MLKEM_N / 4; j++) + __loop__(invariant(j <= MLKEM_N / 4)) + { + unsigned k; + uint16_t t[4]; + for (k = 0; k < 4; k++) + __loop__( + 
invariant(k <= 4) + invariant(forall(r, 0, k, t[r] < (1u << 10)))) + { + t[k] = scalar_compress_d10(a->coeffs[4 * j + k]); + } + + /* + * Make all implicit truncation explicit. No data is being + * truncated for the LHS's since each t[i] is 10-bit in size. + */ + r[5 * j + 0] = (t[0] >> 0) & 0xFF; + r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF); + r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF); + r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF); + r[5 * j + 4] = (t[3] >> 2); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 2; i++) + __loop__( + invariant(i <= MLKEM_N / 2) + invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q))) + { + r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF); + r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF); + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d10(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]) +{ + unsigned j; + for (j = 0; j < MLKEM_N / 4; j++) + __loop__( + invariant(j <= MLKEM_N / 4) + invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q))) + { + unsigned k; + uint16_t t[4]; + uint8_t const *base = &a[5 * j]; + + t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); + t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6)); + t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4)); + t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2)); + + for (k = 0; k < 4; k++) + __loop__( + invariant(k <= 4) + invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q))) + { + r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ + || MLKEM_K == 3) */ + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 +MLKEM_NATIVE_INTERNAL_API +void 
poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + uint8_t t[8] = {0}; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_bound(t, 0, j, 0, 32))) + { + t[j] = scalar_compress_d5(a->coeffs[8 * i + j]); + } + + /* + * Explicitly truncate to avoid warning about + * implicit truncation in CBMC, and use array indexing into + * r rather than pointer-arithmetic to simplify verification + */ + r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5)); + r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); + r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4)); + r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); + r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3)); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a) +{ + unsigned j; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (j = 0; j < MLKEM_N / 8; j++) + __loop__(invariant(j <= MLKEM_N / 8)) + { + unsigned k; + uint16_t t[8]; + for (k = 0; k < 8; k++) + __loop__( + invariant(k <= 8) + invariant(forall(r, 0, k, t[r] < (1u << 11)))) + { + t[k] = scalar_compress_d11(a->coeffs[8 * j + k]); + } + + /* + * Make all implicit truncation explicit. No data is being + * truncated for the LHS's since each t[i] is 11-bit in size. 
+ */ + r[11 * j + 0] = (t[0] >> 0) & 0xFF; + r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF); + r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF); + r[11 * j + 3] = (t[2] >> 2) & 0xFF; + r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF); + r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF); + r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF); + r[11 * j + 7] = (t[5] >> 1) & 0xFF; + r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF); + r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF); + r[11 * j + 10] = (t[7] >> 3); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) + { + unsigned j; + uint8_t t[8]; + const unsigned offset = i * 5; + /* + * Explicitly truncate to avoid warning about + * implicit truncation in CBMC and unwind loop for ease + * of proof. + */ + + /* + * Decompress 5 8-bit bytes (so 40 bits) into + * 8 5-bit values stored in t[] + */ + t[0] = 0x1F & (a[offset + 0] >> 0); + t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3)); + t[2] = 0x1F & (a[offset + 1] >> 2); + t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1)); + t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4)); + t[5] = 0x1F & (a[offset + 3] >> 1); + t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2)); + t[7] = 0x1F & (a[offset + 4] >> 3); + + /* and copy to the correct slice in r[] */ + for (j = 0; j < 8; j++) + __loop__( + invariant(j <= 8 && i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) + { + r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d11(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]) +{ + unsigned j; + for (j = 0; j < MLKEM_N / 8; j++) + __loop__( + invariant(j <= MLKEM_N 
/ 8) + invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q))) + { + unsigned k; + uint16_t t[8]; + uint8_t const *base = &a[11 * j]; + t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); + t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5)); + t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) | + ((uint16_t)base[4] << 10)); + t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7)); + t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4)); + t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) | + ((uint16_t)base[8] << 9)); + t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6)); + t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3)); + + for (k = 0; k < 8; k++) + __loop__( + invariant(k <= 8) + invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q))) + { + r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */ + +#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 2; i++) + __loop__(invariant(i <= MLKEM_N / 2)) + { + const uint16_t t0 = a->coeffs[2 * i]; + const uint16_t t1 = a->coeffs[2 * i + 1]; + /* + * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of + * significant data, so these can be packed into 24 bits or exactly + * 3 bytes, as follows. + */ + + /* Least significant bits 0 - 7 of t0. */ + r[3 * i + 0] = t0 & 0xFF; + + /* + * Most significant bits 8 - 11 of t0 become the least significant + * nibble of the second byte. The least significant 4 bits + * of t1 become the upper nibble of the second byte. + */ + r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0); + + /* Bits 4 - 11 of t1 become the third byte. 
*/ + r[3 * i + 2] = t1 >> 4; + } +} +#else /* MLKEM_USE_NATIVE_POLY_TOBYTES */ +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +{ + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + poly_tobytes_native(r, a->coeffs); +} +#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + +#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 2; i++) + __loop__( + invariant(i <= MLKEM_N / 2) + invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT))) + { + const uint8_t t0 = a[3 * i + 0]; + const uint8_t t1 = a[3 * i + 1]; + const uint8_t t2 = a[3 * i + 2]; + r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF); + r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4); + } + + /* Note that the coefficients are not canonical */ + debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT); +} +#else /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +{ + poly_frombytes_native(r->coeffs, a); +} +#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ + +MLKEM_NATIVE_INTERNAL_API +void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +{ + unsigned i; +#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8) +#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!" 
+#endif + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) + { + unsigned j; + for (j = 0; j < 8; j++) + __loop__( + invariant(i < MLKEM_N / 8 && j <= 8) + invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) + { + /* Prevent the compiler from recognizing this as a bit selection */ + uint8_t mask = value_barrier_u8(1u << j); + r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask); + } + } + debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + msg[i] = 0; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8)) + { + uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]); + msg[i] |= t << j; + } + } +} + +#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ + +#define empty_cu_compress MLKEM_NAMESPACE_K(empty_cu_compress) +int empty_cu_compress; + +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/compress.h new file mode 100644 index 000000000..409dbe519 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/compress.h @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef COMPRESS_H +#define COMPRESS_H + +#include +#include +#include "cbmc.h" +#include "common.h" +#include "debug.h" +#include "poly.h" +#include "verify.h" + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1) +#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4) +#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5) +#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10) +#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11) +#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4) +#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5) +#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10) +#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11) +/* End of static namespacing */ + +/************************************************************ + * Name: scalar_compress_d1 + * + * Description: Computes round(u * 2 / q) + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 1. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. + */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d1(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 2) + ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) ) +{ + uint32_t d0 = u << 1; + d0 *= 645083; + d0 += 1u << 30; + d0 >>= 31; + return d0; +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_compress_d4 + * + * Description: Computes round(u * 16 / q) % 16 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 4. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. 
+ ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. + */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d4(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 16) + ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16)) +{ + uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */ + return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */ +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d4 + * + * Description: Computes round(u * q / 16) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 4. + * + * Arguments: - u: Unsigned canonical modulus modulo 16 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d4(uint32_t u) +__contract__( + requires(0 <= u && u < 16) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 8) / 16; } + +/************************************************************ + * Name: scalar_compress_d5 + * + * Description: Computes round(u * 32 / q) % 32 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 5. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. 
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d5(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 32) + ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) ) +{ + uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */ + return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */ +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d5 + * + * Description: Computes round(u * q / 32) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 5. + * + * Arguments: - u: Unsigned canonical modulus modulo 32 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d5(uint32_t u) +__contract__( + requires(0 <= u && u < 32) + ensures(return_value <= MLKEM_Q - 1) +) { return ((u * MLKEM_Q) + 16) / 32; } + +/************************************************************ + * Name: scalar_compress_d10 + * + * Description: Computes round(u * 2**10 / q) % 2**10 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 10. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. 
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d10(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < (1u << 10)) + ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10))) +{ + uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^33 / MLKEM_Q) */ + d0 = (d0 + ((uint64_t)1u << 32)) >> 33; + return (d0 & 0x3FF); +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d10 + * + * Description: Computes round(u * q / 1024) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 10. + * + * Arguments: - u: Unsigned canonical modulus modulo 1024 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d10(uint32_t u) +__contract__( + requires(0 <= u && u < 1024) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 512) / 1024; } + +/************************************************************ + * Name: scalar_compress_d11 + * + * Description: Computes round(u * 2**11 / q) % 2**11 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 11. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. 
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d11(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < (1u << 11)) + ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11))) +{ + uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */ + d0 = (d0 + ((uint64_t)1u << 32)) >> 33; + return (d0 & 0x7FF); +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d11 + * + * Description: Computes round(u * q / 2048) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 11. + * + * Arguments: - u: Unsigned canonical modulus modulo 2048 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d11(uint32_t u) +__contract__( + requires(0 <= u && u < 2048) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 1024) / 2048; } + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \ + (MLKEM_K == 2 || MLKEM_K == 3) +#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4) +/************************************************* + * Name: poly_compress_d4 + * + * Description: Compression (4 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a); + +#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10) +/************************************************* + * Name: poly_compress_d10 + * + * Description: Compression (10 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a); + +#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4) +/************************************************* + * Name: poly_decompress_d4 + * + * Description: De-serialization and subsequent decompression (4 bits) of a + * polynomial; approximate inverse of poly_compress_d4 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). 
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]); + +#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10) +/************************************************* + * Name: poly_decompress_d10 + * + * Description: De-serialization and subsequent decompression (10 bits) of a + * polynomial; approximate inverse of poly_compress_d10 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d10(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]); +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ + || MLKEM_K == 3) */ + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 +#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5) +/************************************************* + * Name: poly_compress_d5 + * + * Description: Compression (5 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a); + +#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11) +/************************************************* + * Name: poly_compress_d11 + * + * Description: Compression (11 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a); + +#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5) +/************************************************* + * Name: poly_decompress_d5 + * + * Description: De-serialization and subsequent decompression (dv bits) of a + * polynomial; approximate inverse of poly_compress + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). 
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]); + +#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11) +/************************************************* + * Name: poly_decompress_d11 + * + * Description: De-serialization and subsequent decompression (11 bits) of a + * polynomial; approximate inverse of poly_compress_d11 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d11(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]); +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \ + */ + +#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes) +/************************************************* + * Name: poly_tobytes + * + * Description: Serialization of a polynomial. + * Signed coefficients are converted to + * unsigned form before serialization. 
+ * + * Arguments: INPUT: + * - a: const pointer to input polynomial, + * with each coefficient in the range [0,1,..,Q-1] + * OUTPUT + * - r: pointer to output byte array + * (of MLKEM_POLYBYTES bytes) + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYBYTES)) + requires(memory_no_alias(a, sizeof(poly))) + requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(r)) +); + + +#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes) +/************************************************* + * Name: poly_frombytes + * + * Description: De-serialization of a polynomial. + * + * Arguments: INPUT + * - a: pointer to input byte array + * (of MLKEM_POLYBYTES bytes) + * OUTPUT + * - r: pointer to output polynomial, with + * each coefficient unsigned and in the range + * 0 .. 4095 + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)) +); + + +#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg) +/************************************************* + * Name: poly_frommsg + * + * Description: Convert 32-byte message to polynomial + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *msg: pointer to input message + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +__contract__( + requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(object_whole(r)) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) +); + +#define poly_tomsg 
MLKEM_NAMESPACE(poly_tomsg) +/************************************************* + * Name: poly_tomsg + * + * Description: Convert polynomial to 32-byte message + * + * Arguments: - uint8_t *msg: pointer to output message + * - const poly *r: pointer to input polynomial + * Coefficients must be unsigned canonical + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r) +__contract__( + requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(msg)) +); + +#endif /* COMPRESS_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/config.h index fa89370ce..e975ede95 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/config.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/config.h @@ -122,46 +122,87 @@ /* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ /****************************************************************************** - * Name: MLKEM_USE_NATIVE + * Name: MLKEM_USE_NATIVE_BACKEND_ARITH * - * Description: Determines whether a native backend should - * be used, if available. + * Description: Determines whether a native arithmetic backend should be used. + * + * The arithmetic backend covers performance critical functions + * such as the number-theoretic transform (NTT). + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the arithmetic backend to be used is + * determined by MLKEM_NATIVE_ARITH_BACKEND_FILE: If the latter is + * unset, the default backend for the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. * * This can also be set using CFLAGS.
* *****************************************************************************/ -#if !defined(MLKEM_USE_NATIVE) -/* #define MLKEM_USE_NATIVE */ +#if !defined(MLKEM_USE_NATIVE_BACKEND_ARITH) +/* #define MLKEM_USE_NATIVE_BACKEND_ARITH */ #endif /****************************************************************************** - * Name: MLKEM_NATIVE_ARITH_BACKEND + * Name: MLKEM_NATIVE_ARITH_BACKEND_FILE * * Description: The arithmetic backend to use. * - * This must be the filename of an arithmetic backend. - * See the existing backends for examples. + * If MLKEM_USE_NATIVE_BACKEND_ARITH is unset, this option + * is ignored. + * + * If MLKEM_USE_NATIVE_BACKEND_ARITH is set, this option must + * either be undefined or the filename of an arithmetic backend. + * If unset, the default backend will be used. * * This can be set using CFLAGS. * *****************************************************************************/ -#if defined(MLKEM_USE_NATIVE) && !defined(MLKEM_NATIVE_ARITH_BACKEND) -#define MLKEM_NATIVE_ARITH_BACKEND "default.h" -#endif /* MLKEM_NATIVE_ARITH_BACKEND */ +#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) && \ + !defined(MLKEM_NATIVE_ARITH_BACKEND_FILE) +#define MLKEM_NATIVE_ARITH_BACKEND_FILE "native/default.h" +#endif /****************************************************************************** - * Name: MLKEM_NATIVE_FIPS202_BACKEND + * Name: MLKEM_USE_NATIVE_BACKEND_FIPS202 + * + * Description: Determines whether a native FIPS202 backend should be used. + * + * The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is + * the performance bottleneck of SHA3 and SHAKE. + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the FIPS202 backend to be used is + * determined by MLKEM_NATIVE_FIPS202_BACKEND_FILE: If the latter is + * unset, the default backend for the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. + * + * This can also be set using CFLAGS.
+ * + *****************************************************************************/ +#if !defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) +/* #define MLKEM_USE_NATIVE_BACKEND_FIPS202 */ +#endif + +/****************************************************************************** + * Name: MLKEM_NATIVE_FIPS202_BACKEND_FILE * * Description: The FIPS-202 backend to use. * - * This must be the filename of an FIPS-202 backend. + * If MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, this option must + * either be undefined or the filename of a FIPS202 backend. + * If unset, the default backend will be used. * * This can be set using CFLAGS. * *****************************************************************************/ -#if defined(MLKEM_USE_NATIVE_FIPS202) && !defined(MLKEM_NATIVE_FIPS202_BACKEND) -#define MLKEM_NATIVE_FIPS202_BACKEND "native/default.h" -#endif /* MLKEM_NATIVE_FIPS202_BACKEND */ +#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) && \ + !defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE) +#define MLKEM_NATIVE_FIPS202_BACKEND_FILE "fips202/native/default.h" +#endif /************************* Config internals ********************************/ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/default.h deleted file mode 100644 index d1e41c52e..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/default.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H -#define MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H - -/* - * Default arithmetic backend - */ -#include "sys.h" - -#ifdef SYS_AARCH64 -/* - * For AArch64, we currently we have one clean and one opt profile. - * We default to the opt profile. - * - * In the future, this may branch further depending on the microarchitecture. 
- */ -#include "aarch64/opt.h" -#endif /* SYS_AARCH64 */ - -#ifdef SYS_X86_64_AVX2 -/* - * For now, there's only one x86_64 profile, based on - * the AVX2 code from the Kyber repository. - * https://github.com/pq-crystals/kyber - */ -#include "x86_64/default.h" -#endif /* SYS_X86_64 */ - -#endif /* MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.c index 0cfcc3e9e..318d0fc77 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.c @@ -9,11 +9,10 @@ #include "fips202.h" #include "fips202x4.h" #include "indcpa.h" -#include "ntt.h" #include "poly.h" -#include "polyvec.h" +#include "poly_k.h" #include "randombytes.h" -#include "rej_uniform.h" +#include "sampling.h" #include "symmetric.h" #include "arith_backend.h" @@ -149,14 +148,14 @@ static void unpack_ciphertext(polyvec *b, poly *v, #define poly_permute_bitrev_to_custom \ MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom) -static INLINE void poly_permute_bitrev_to_custom(poly *data) +static INLINE void poly_permute_bitrev_to_custom(int16_t data[MLKEM_N]) __contract__( /* We don't specify that this should be a permutation, but only * that it does not change the bound established at the end of gen_matrix. 
*/ - requires(memory_no_alias(data, sizeof(poly))) - requires(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N)) + requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q)) assigns(memory_slice(data, sizeof(poly))) - ensures(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); } + ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); } #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ /* Not static for benchmarking */ @@ -245,7 +244,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed) { for (j = 0; j < MLKEM_K; j++) { - poly_permute_bitrev_to_custom(&a[i].vec[j]); + poly_permute_bitrev_to_custom(a[i].vec[j].coeffs); } } } diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.h index 2c4fda3c4..b4d5985bf 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.h @@ -8,7 +8,7 @@ #include #include "cbmc.h" #include "common.h" -#include "polyvec.h" +#include "poly_k.h" #define gen_matrix MLKEM_NAMESPACE_K(gen_matrix) /************************************************* diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/api.h new file mode 100644 index 000000000..0704f9dcd --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/api.h @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * Native arithmetic interface + * + * This header is primarily for documentation purposes. + * It should not be included by backend implementations. + * + * To ensure consistency with backends, the header will be + * included automatically after inclusion of the active + * backend, to ensure consistency of function signatures, + * and run sanity checks. 
+ */ +#ifdef MLKEM_NATIVE_ARITH_NATIVE_API_H +#error \ + "The arithmetic backend API `mlkem/native/api.h` " \ + "should not be directly included. Please include the relevant " \ + "structure headers directly." +#else /* MLKEM_NATIVE_ARITH_NATIVE_API_H */ +#define MLKEM_NATIVE_ARITH_NATIVE_API_H + +#include +#include "../common.h" + +/* + * This is the C<->native interface allowing for the drop-in of + * native code for performance critical arithmetic components of ML-KEM. + * + * A _backend_ is a specific implementation of (part of) this interface. + * + * To add a function to a backend, define MLKEM_USE_NATIVE_XXX and + * implement `static inline xxx(...)` in the profile header. + * + * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can + * be set if there are native implementations for all of NTT, invNTT, and + * base multiplication, and allows the native implementation to use a + * custom order of polynomial coefficients in NTT domain -- the use of such + * custom order is not an implementation-detail since the public matrix + * is generated in NTT domain. In this case, a permutation function + * poly_permute_bitrev_to_custom() needs to be provided that permutes + * polynomials in NTT domain from bitreversed to the custom order. + */ + +/* + * Those functions are meant to be trivial wrappers around the chosen native + * implementation. They are static inline to avoid unnecessary calls. + * The macro before each declaration controls whether a native + * implementation is present. + */ + +#if defined(MLKEM_USE_NATIVE_NTT) +/************************************************* + * Name: ntt_native + * + * Description: Computes negacyclic number-theoretic transform (NTT) of + * a polynomial in place. + * + * The input polynomial is assumed to be in normal order. + * The output polynomial is in bitreversed order, or of a + * custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER + * for more information. + * + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial + **************************************************/ +static INLINE void ntt_native(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_NTT */ + +#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER) +/* + * This must only be set if NTT, invNTT, basemul, mulcache, and + * to/from byte stream conversions all have native implementations + * that are adapted to the custom order. + */ +#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) || \ + !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) || \ + !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) +#error \ + "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \ +set if there are native implementations for NTT, invNTT, mulcache, basemul, \ +and to/from bytes conversions." +#endif + +/************************************************* + * Name: poly_permute_bitrev_to_custom + * + * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined, + * convert a polynomial in NTT domain from bitreversed + * order to the custom order output by the native NTT. + * + * This must only be defined if there is native code for + * all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache. + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial + * + **************************************************/ +static INLINE void poly_permute_bitrev_to_custom(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ + +#if defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: intt_native + * + * Description: Computes inverse of negacyclic number-theoretic transform (NTT) + * of a polynomial in place. 
+ * + * The input polynomial is in bitreversed order, or of a + * custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. + * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER + * for more information. + * The output polynomial is in normal order. + * + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial + **************************************************/ +static INLINE void intt_native(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_INTT */ + +#if defined(MLKEM_USE_NATIVE_POLY_REDUCE) +/************************************************* + * Name: poly_reduce_native + * + * Description: Applies modular reduction to all coefficients of a polynomial. + * + * Arguments: - int16_t p[MLKEM_N]: pointer to input/output polynomial + **************************************************/ +static INLINE void poly_reduce_native(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ + +#if defined(MLKEM_USE_NATIVE_POLY_TOMONT) +/************************************************* + * Name: poly_tomont_native + * + * Description: Inplace conversion of all coefficients of a polynomial + * from normal domain to Montgomery domain + * + * Arguments: - int16_t p[MLKEM_N]: pointer to input/output polynomial + **************************************************/ +static INLINE void poly_tomont_native(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ + +#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) +/************************************************* + * Name: poly_mulcache_compute_native + * + * Description: Compute multiplication cache for a polynomial + * in NTT domain. + * + * The purpose of the multiplication cache is to + * cache repeated computations required during a + * base multiplication of polynomials in NTT domain. + * The structure of the multiplication-cache is + * implementation defined. + * + * Arguments: INPUT: + * - poly: const pointer to input polynomial.
+ * This must be in NTT domain and in bitreversed order, or of + * a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. + * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER + * for more information. + * OUTPUT + * - cache: pointer to multiplication cache + **************************************************/ +static INLINE void poly_mulcache_compute_native(int16_t cache[MLKEM_N / 2], + const int16_t poly[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ + +#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) +/************************************************* + * Name: polyvec_basemul_acc_montgomery_cached_native + * + * Description: Compute multiplication of polynomial vectors in NTT domain. + * + * Arguments: INPUT: + * - a: First polynomial vector operand. + * This must be in NTT domain and in bitreversed order, or of + * a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. + * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER + * for more information. + * - b: Second polynomial vector operand. + * As for a. + * - b_cache: Multiplication-cache for b. + * OUTPUT + * - r: Result of the base multiplication. This is again + * in NTT domain, and of the same order as a and b. + **************************************************/ +static INLINE void polyvec_basemul_acc_montgomery_cached_native( + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]); +#endif + +#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES) +/************************************************* + * Name: poly_tobytes_native + * + * Description: Serialization of a polynomial. + * Signed coefficients are converted to + * unsigned form before serialization. + * + * Arguments: INPUT: + * - a: const pointer to input polynomial, + * with each coefficient in the range -Q+1 ..
Q-1 + * OUTPUT + * - r: pointer to output byte array + * (of MLKEM_POLYBYTES bytes) + **************************************************/ +static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], + const int16_t a[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + +#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) +/************************************************* + * Name: poly_frombytes_native + * + * Description: De-serialization of a polynomial. + * The byte array is unpacked into the + * coefficients of the output polynomial. + * + * Arguments: INPUT: + * - r: const pointer to input byte array + * (of MLKEM_POLYBYTES bytes) + * OUTPUT + * - a: pointer to output polynomial in NTT domain + **************************************************/ +static INLINE void poly_frombytes_native(int16_t a[MLKEM_N], + const uint8_t r[MLKEM_POLYBYTES]); +#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ + +#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM) +/************************************************* + * Name: rej_uniform_native + * + * Description: Run rejection sampling on uniform random bytes to generate + * uniform random integers mod q + * + * Arguments: - int16_t *r: pointer to output buffer + * - unsigned int len: requested number of 16-bit integers + * (uniform mod q). + * - const uint8_t *buf: pointer to input buffer + * (assumed to be uniform random bytes) + * - unsigned int buflen: length of input buffer in bytes. + * + * Return -1 if the native implementation does not support the input lengths. + * Otherwise, returns non-negative number of sampled 16-bit integers (at most + * len).
+ **************************************************/ +static INLINE int rej_uniform_native(int16_t *r, unsigned int len, + const uint8_t *buf, unsigned int buflen); +#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */ + +#endif /* MLKEM_NATIVE_ARITH_NATIVE_API_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/default.h new file mode 100644 index 000000000..f9fe4310a --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/default.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H +#define MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H + +/* + * Default arithmetic backend + */ +#include "../sys.h" + +#ifdef SYS_AARCH64 +/* + * For AArch64, we currently have one clean and one opt profile. + * We default to the opt profile. + * + * In the future, this may branch further depending on the microarchitecture. + */ +#include "aarch64/opt.h" +#endif /* SYS_AARCH64 */ + +#ifdef SYS_X86_64_AVX2 +/* + * For now, there's only one x86_64 profile, based on + * the AVX2 code from the Kyber repository.
+ * https://github.com/pq-crystals/kyber + */ +#include "x86_64/default.h" +#endif /* SYS_X86_64 */ + +#endif /* MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/README.md b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/README.md similarity index 100% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/README.md rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/README.md diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/default.h similarity index 90% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/default.h rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/default.h index 592e8996d..73f53dc13 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/default.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/default.h @@ -19,6 +19,6 @@ /* Filename of the C backend implementation. * This is not inlined here because this header is included in assembly * files as well. 
*/ -#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "x86_64/src/default_impl.h" +#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/x86_64/src/default_impl.h" #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/align.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/align.h similarity index 100% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/align.h rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/align.h diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/arith_native_x86_64.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/arith_native_x86_64.h similarity index 91% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/arith_native_x86_64.h rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/arith_native_x86_64.h index 25e00a930..acde977ad 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/arith_native_x86_64.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/arith_native_x86_64.h @@ -5,11 +5,10 @@ #ifndef MLKEM_X86_64_NATIVE_H #define MLKEM_X86_64_NATIVE_H -#include "common.h" +#include "../../../common.h" #include #include -#include "polyvec.h" #include "consts.h" #define REJ_UNIFORM_AVX_NBLOCKS 3 /* See MLKEM_GEN_MATRIX_NBLOCKS */ @@ -44,8 +43,9 @@ void basemul_avx2(__m256i *r, const __m256i *a, const __m256i *b, #define polyvec_basemul_acc_montgomery_cached_avx2 \ MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_avx2) void polyvec_basemul_acc_montgomery_cached_avx2( - poly *r, const polyvec *a, const polyvec *b, - const polyvec_mulcache *b_cache); + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]); #define ntttobytes_avx2 MLKEM_NAMESPACE(ntttobytes_avx2) void ntttobytes_avx2(uint8_t *r, const __m256i *a, const __m256i *qdata); diff --git 
a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.S rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.S index b97840e70..5fdc3d0a0 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.S @@ -6,7 +6,7 @@ // Implementation from Kyber reference repository // https://github.com/pq-crystals/kyber/blob/main/avx2 -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) #include "consts.h" diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.c similarity index 51% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.c rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.c index 5f9ae99c8..970938306 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.c @@ -3,46 +3,46 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) -#include "poly.h" -#include "polyvec.h" - #include "arith_native_x86_64.h" #include "consts.h" -static void poly_basemul_montgomery_avx2(poly *r, const poly *a, const poly *b) +static void poly_basemul_montgomery_avx2(int16_t r[MLKEM_N], + const int16_t a[MLKEM_N], + const int16_t b[MLKEM_N]) { - basemul_avx2((__m256i *)r->coeffs, (const __m256i *)a->coeffs, - (const __m256i *)b->coeffs, qdata.vec); + basemul_avx2((__m256i *)r, (const __m256i *)a, (const __m256i *)b, qdata.vec); } /* * Implementation from Kyber reference repository * 
https://github.com/pq-crystals/kyber/blob/main/avx2 */ -static void poly_add_avx2(poly *r, const poly *a, const poly *b) +static void poly_add_avx2(int16_t r[MLKEM_N], const int16_t a[MLKEM_N], + const int16_t b[MLKEM_N]) { unsigned i; __m256i f0, f1; for (i = 0; i < MLKEM_N; i += 16) { - f0 = _mm256_load_si256((const __m256i *)&a->coeffs[i]); - f1 = _mm256_load_si256((const __m256i *)&b->coeffs[i]); + f0 = _mm256_load_si256((const __m256i *)&a[i]); + f1 = _mm256_load_si256((const __m256i *)&b[i]); f0 = _mm256_add_epi16(f0, f1); - _mm256_store_si256((__m256i *)&r->coeffs[i], f0); + _mm256_store_si256((__m256i *)&r[i], f0); } } -void polyvec_basemul_acc_montgomery_cached_avx2(poly *r, const polyvec *a, - const polyvec *b, - const polyvec_mulcache *b_cache) +void polyvec_basemul_acc_montgomery_cached_avx2( + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]) { unsigned i; - poly t; + int16_t t[MLKEM_N] ALIGN; /* TODO: Use mulcache for AVX2. So far, it is unused. */ ((void)b_cache); @@ -50,11 +50,11 @@ void polyvec_basemul_acc_montgomery_cached_avx2(poly *r, const polyvec *a, /* Coefficient-wise bound of each basemul is 2q. * Since we are accumulating at most 4 times, the * overall bound is 8q < INT16_MAX. 
*/ - poly_basemul_montgomery_avx2(r, &a->vec[0], &b->vec[0]); + poly_basemul_montgomery_avx2(r, &a[0], &b[0]); for (i = 1; i < MLKEM_K; i++) { - poly_basemul_montgomery_avx2(&t, &a->vec[i], &b->vec[i]); - poly_add_avx2(r, r, &t); + poly_basemul_montgomery_avx2(t, &a[i * MLKEM_N], &b[i * MLKEM_N]); + poly_add_avx2(r, r, t); } } diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/consts.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/consts.c similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/consts.c rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/consts.c index 86a0835ef..568752ae8 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/consts.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/consts.c @@ -8,7 +8,7 @@ * https://github.com/pq-crystals/kyber/blob/main/avx2/consts.c */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/consts.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/consts.h similarity index 97% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/consts.h rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/consts.h index 00c415952..e2846b609 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/consts.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/consts.h @@ -11,7 +11,7 @@ #ifndef CONSTS_H #define CONSTS_H -#include "common.h" +#include "../../../common.h" #define AVX2_BACKEND_DATA_OFFSET_16XQ 0 #define AVX2_BACKEND_DATA_OFFSET_16XQINV 16 diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/default_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/default_impl.h similarity index 62% rename from 
src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/default_impl.h rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/default_impl.h index 029111c17..3683361e2 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/default_impl.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/default_impl.h @@ -12,8 +12,7 @@ #include -#include "poly.h" -#include "polyvec.h" +#include "../../../params.h" #include "arith_native_x86_64.h" #define MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER @@ -28,9 +27,9 @@ #define MLKEM_USE_NATIVE_POLY_TOBYTES #define MLKEM_USE_NATIVE_POLY_FROMBYTES -static INLINE void poly_permute_bitrev_to_custom(poly *data) +static INLINE void poly_permute_bitrev_to_custom(int16_t data[MLKEM_N]) { - nttunpack_avx2((__m256i *)(data->coeffs), qdata.vec); + nttunpack_avx2((__m256i *)(data), qdata.vec); } static INLINE int rej_uniform_native(int16_t *r, unsigned int len, @@ -45,27 +44,28 @@ static INLINE int rej_uniform_native(int16_t *r, unsigned int len, return (int)rej_uniform_avx2(r, buf); } -static INLINE void ntt_native(poly *data) +static INLINE void ntt_native(int16_t data[MLKEM_N]) { ntt_avx2((__m256i *)data, qdata.vec); } -static INLINE void intt_native(poly *data) +static INLINE void intt_native(int16_t data[MLKEM_N]) { invntt_avx2((__m256i *)data, qdata.vec); } -static INLINE void poly_reduce_native(poly *data) +static INLINE void poly_reduce_native(int16_t data[MLKEM_N]) { - reduce_avx2((__m256i *)data->coeffs, qdata.vec); + reduce_avx2((__m256i *)data, qdata.vec); } -static INLINE void poly_tomont_native(poly *data) +static INLINE void poly_tomont_native(int16_t data[MLKEM_N]) { - tomont_avx2((__m256i *)data->coeffs, qdata.vec); + tomont_avx2((__m256i *)data, qdata.vec); } -static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const poly *y) +static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2], + const int16_t y[MLKEM_N]) { /* AVX2 backend does not use mulcache */ 
((void)y); @@ -73,22 +73,23 @@ static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const poly *y) } static INLINE void polyvec_basemul_acc_montgomery_cached_native( - poly *r, const polyvec *a, const polyvec *b, - const polyvec_mulcache *b_cache) + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]) { polyvec_basemul_acc_montgomery_cached_avx2(r, a, b, b_cache); } static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], - const poly *a) + const int16_t a[MLKEM_N]) { - ntttobytes_avx2(r, (const __m256i *)a->coeffs, qdata.vec); + ntttobytes_avx2(r, (const __m256i *)a, qdata.vec); } -static INLINE void poly_frombytes_native(poly *r, +static INLINE void poly_frombytes_native(int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYBYTES]) { - nttfrombytes_avx2((__m256i *)r->coeffs, a, qdata.vec); + nttfrombytes_avx2((__m256i *)r, a, qdata.vec); } #endif /* MLKEM_NATIVE_ARITH_PROFILE_IMPL_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/fq.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/fq.S similarity index 98% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/fq.S rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/fq.S index 134bd4f71..3f013a5fa 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/fq.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/fq.S @@ -11,7 +11,7 @@ // in [0,1,...,q-1] rather than [0,1,...,q], matching the // semantics of poly_reduce(). 
-#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) #include "consts.h" diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/fq.inc b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/fq.inc similarity index 100% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/fq.inc rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/fq.inc diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/intt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/intt.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/intt.S rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/intt.S index 6b1d78ef2..7b1f22624 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/intt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/intt.S @@ -9,7 +9,7 @@ * Changes to placement of modular reductions have * been made to simplify reasoning of non-overflow */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/ntt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntt.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/ntt.S rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntt.S index e8bf7894b..5d928b4cc 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/ntt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntt.S @@ -6,7 +6,7 @@ // Implementation from Kyber reference repository // https://github.com/pq-crystals/kyber/blob/main/avx2 -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) #include "consts.h" diff --git 
a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/rej_uniform_avx2.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_avx2.c similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/rej_uniform_avx2.c rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_avx2.c index 54037a0df..adf2d338b 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/rej_uniform_avx2.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_avx2.c @@ -8,7 +8,7 @@ * https://github.com/pq-crystals/kyber/blob/main/avx2 */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/rej_uniform_table.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_table.c similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/rej_uniform_table.c rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_table.c index 9bbc47146..e95fd9e79 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/rej_uniform_table.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_table.c @@ -8,7 +8,7 @@ * Do not modify it directly. 
*/ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/shuffle.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/shuffle.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/shuffle.S rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/shuffle.S index 5e708748a..9bcd04896 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/shuffle.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/shuffle.S @@ -6,7 +6,7 @@ // Implementation from Kyber reference repository // https://github.com/pq-crystals/kyber/blob/main/avx2 -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/shuffle.inc b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/shuffle.inc similarity index 100% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/shuffle.inc rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/shuffle.inc diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/x86_64_zetas.i b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/x86_64_zetas.i similarity index 100% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/x86_64_zetas.i rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/x86_64_zetas.i diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/ntt.c deleted file mode 100644 index 3651c8da9..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/ntt.c +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#if 
!defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) - -#include -#include "arith_backend.h" -#include "debug.h" -#include "ntt.h" -#include "reduce.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block) -#define ntt_layer MLKEM_NAMESPACE(ntt_layer) -#define invntt_layer MLKEM_NAMESPACE(invntt_layer) -/* End of static namespacing */ - -#if !defined(MLKEM_USE_NATIVE_NTT) -/* - * Computes a block CT butterflies with a fixed twiddle factor, - * using Montgomery multiplication. - * Parameters: - * - r: Pointer to base of polynomial (_not_ the base of butterfly block) - * - root: Twiddle factor to use for the butterfly. This must be in - * Montgomery form and signed canonical. - * - start: Offset to the beginning of the butterfly block - * - len: Index difference between coefficients subject to a butterfly - * - bound: Ghost variable describing coefficient bound: Prior to `start`, - * coefficients must be bound by `bound + MLKEM_Q`. Post `start`, - * they must be bound by `bound`. - * When this function returns, output coefficients in the index range - * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`. 
- * Example: - * - start=8, len=4 - * This would compute the following four butterflies - * 8 -- 12 - * 9 -- 13 - * 10 -- 14 - * 11 -- 15 - * - start=4, len=2 - * This would compute the following two butterflies - * 4 -- 6 - * 5 -- 7 - */ -static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, - unsigned start, unsigned len, int bound) -__contract__( - requires(start < MLKEM_N) - requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N) - requires(0 <= bound && bound < INT16_MAX - MLKEM_Q) - requires(-HALF_Q < zeta && zeta < HALF_Q) - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(array_abs_bound(r, 0, start, bound + MLKEM_Q)) - requires(array_abs_bound(r, start, MLKEM_N, bound)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q)) - ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound))) -{ - /* `bound` is a ghost variable only needed in the CBMC specification */ - unsigned j; - ((void)bound); - for (j = start; j < start + len; j++) - __loop__( - invariant(start <= j && j <= start + len) - /* - * Coefficients are updated in strided pairs, so the bounds for the - * intermediate states alternate twice between the old and new bound - */ - invariant(array_abs_bound(r, 0, j, bound + MLKEM_Q)) - invariant(array_abs_bound(r, j, start + len, bound)) - invariant(array_abs_bound(r, start + len, j + len, bound + MLKEM_Q)) - invariant(array_abs_bound(r, j + len, MLKEM_N, bound))) - { - int16_t t; - t = fqmul(r[j + len], zeta); - r[j + len] = r[j] - t; - r[j] = r[j] + t; - } -} - -/* - *Compute one layer of forward NTT - * Parameters: - * - r: Pointer to base of polynomial - * - len: Stride of butterflies in this layer. - * - layer: Ghost variable indicating which layer is being applied. - * Must match `len` via `len == MLKEM_N >> layer`. 
- * Note: `len` could be dropped and computed in the function, but - * we are following the structure of the reference NTT from the - * official Kyber implementation here, merely adding `layer` as - * a ghost variable for the specifications. - */ -static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer) -__contract__( - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer)) - requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q))) -{ - unsigned start, k; - /* `layer` is a ghost variable only needed in the CBMC specification */ - ((void)layer); - /* Twiddle factors for layer n start at index 2^(layer-1) */ - k = MLKEM_N / (2 * len); - for (start = 0; start < MLKEM_N; start += 2 * len) - __loop__( - invariant(start < MLKEM_N + 2 * len) - invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N) - invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q)) - invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q))) - { - int16_t zeta = zetas[k++]; - ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q); - } -} - -/* - * Compute full forward NTT - * NOTE: This particular implementation satisfies a much tighter - * bound on the output coefficients (5*q) than the contractual one (8*q), - * but this is not needed in the calling code. Should we change the - * base multiplication strategy to require smaller NTT output bounds, - * the proof may need strengthening. 
- */ - -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *p) -{ - unsigned len, layer; - int16_t *r; - debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); - r = p->coeffs; - - for (len = 128, layer = 1; len >= 2; len >>= 1, layer++) - __loop__( - invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer)) - invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))) - { - ntt_layer(r, len, layer); - } - - /* Check the stronger bound */ - debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); -} -#else /* MLKEM_USE_NATIVE_NTT */ - -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *p) -{ - debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); - ntt_native(p); - debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); -} -#endif /* MLKEM_USE_NATIVE_NTT */ - -#if !defined(MLKEM_USE_NATIVE_INTT) - -/* Compute one layer of inverse NTT */ -static void invntt_layer(int16_t *r, unsigned len, unsigned layer) -__contract__( - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7) - requires(len == (1 << (8 - layer))) - requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) -{ - unsigned start, k; - /* `layer` is a ghost variable used only in the specification */ - ((void)layer); - k = MLKEM_N / len - 1; - for (start = 0; start < MLKEM_N; start += 2 * len) - __loop__( - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) - invariant(start <= MLKEM_N && k <= 127) - /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */ - invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len)) - { - unsigned j; - int16_t zeta = zetas[k--]; - for (j = start; j < start + len; j++) - __loop__( - invariant(start <= j && j <= start + len) - invariant(start <= MLKEM_N && k <= 127) - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) - { - int16_t t = r[j]; - r[j] = barrett_reduce(t + r[j + len]); - r[j + len] = r[j + len] - t; - r[j + len] = fqmul(r[j + len], 
zeta); - } - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *p) -{ - /* - * Scale input polynomial to account for Montgomery factor - * and NTT twist. This also brings coefficients down to - * absolute value < MLKEM_Q. - */ - unsigned j, len, layer; - const int16_t f = 1441; - int16_t *r = p->coeffs; - - for (j = 0; j < MLKEM_N; j++) - __loop__( - invariant(j <= MLKEM_N) - invariant(array_abs_bound(r, 0, j, MLKEM_Q))) - { - r[j] = fqmul(r[j], f); - } - - /* Run the invNTT layers */ - for (len = 2, layer = 7; len <= 128; len <<= 1, layer--) - __loop__( - invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer))) - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) - { - invntt_layer(p->coeffs, len, layer); - } - - debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); -} -#else /* MLKEM_USE_NATIVE_INTT */ - -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *p) -{ - intt_native(p); - debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); -} -#endif /* MLKEM_USE_NATIVE_INTT */ - -MLKEM_NATIVE_INTERNAL_API -void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], - int16_t b_cached) -{ - int32_t t0, t1; - debug_assert_bound(a, 2, 0, UINT12_LIMIT); - - t0 = (int32_t)a[1] * b_cached; - t0 += (int32_t)a[0] * b[0]; - t1 = (int32_t)a[0] * b[1]; - t1 += (int32_t)a[1] * b[0]; - - /* |ti| < 2 * q * 2^15 */ - r[0] = montgomery_reduce(t0); - r[1] = montgomery_reduce(t1); - - debug_assert_abs_bound(r, 2, 2 * MLKEM_Q); -} - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt) -int empty_cu_ntt; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/ntt.h deleted file mode 100644 index 4e80d3ab3..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/ntt.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * 
SPDX-License-Identifier: Apache-2.0 - */ -#ifndef NTT_H -#define NTT_H -#include "common.h" - -#include -#include "cbmc.h" -#include "poly.h" -#include "reduce.h" - -#define zetas MLKEM_NAMESPACE(zetas) -extern const int16_t zetas[128]; - -#define poly_ntt MLKEM_NAMESPACE(poly_ntt) -/************************************************* - * Name: poly_ntt - * - * Description: Computes negacyclic number-theoretic transform (NTT) of - * a polynomial in place. - * - * The input is assumed to be in normal order and - * coefficient-wise bound by MLKEM_Q in absolute value. - * - * The output polynomial is in bitreversed order, and - * coefficient-wise bound by NTT_BOUND in absolute value. - * - * (NOTE: Sometimes the input to the NTT is actually smaller, - * which gives better bounds.) - * - * Arguments: - poly *p: pointer to in/output polynomial - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *r) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q)) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND)) -); - -#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont) -/************************************************* - * Name: poly_invntt_tomont - * - * Description: Computes inverse of negacyclic number-theoretic transform (NTT) - * of a polynomial in place; - * inputs assumed to be in bitreversed order, output in normal - * order - * - * The input is assumed to be in bitreversed order, and can - * have arbitrary coefficients in int16_t. - * - * The output polynomial is in normal order, and - * coefficient-wise bound by INVNTT_BOUND in absolute value. 
- * - * Arguments: - uint16_t *a: pointer to in/output polynomial - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *r) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND)) -); - -#define basemul_cached MLKEM_NAMESPACE(basemul_cached) -/************************************************************ - * Name: basemul_cached - * - * Description: Computes a representative modulo q of - * (a0*b0 + a1*b_cached, a0*b1 + a1*b0)/65536 - * - * If b_cached is b1*zeta, this represents the - * product of (a0 + a1*X) and (b0 + b1*X) in - * Fq[X]/(X^2 - zeta). - * - * Arguments: - r: Pointer to output polynomial - * Upon return, coefficients are bound by - * 2*MLKEM_Q in absolute value. - * - a: Pointer to first input polynomial - * Every coefficient must be in [0..4095] - * - b: Pointer to second input polynomial - * Can have arbitrary int16_t coefficients - * - b_cached: Some precomputed value, typically derived from - * b1 and a twiddle factor. Can be an arbitary int16_t. 
- ************************************************************/ -MLKEM_NATIVE_INTERNAL_API -void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], - int16_t b_cached) -__contract__( - requires(memory_no_alias(r, 2 * sizeof(int16_t))) - requires(memory_no_alias(a, 2 * sizeof(int16_t))) - requires(memory_no_alias(b, 2 * sizeof(int16_t))) - requires(array_bound(a, 0, 2, 0, UINT12_LIMIT)) - assigns(memory_slice(r, 2 * sizeof(int16_t))) - ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q)) -); - -#endif /* NTT_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/params.h index 57ea4c8ba..7f6c12625 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/params.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/params.h @@ -18,6 +18,7 @@ #define MLKEM_N 256 #define MLKEM_Q 3329 #define UINT12_LIMIT 4096 +#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */ #define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define MLKEM_SSBYTES 32 /* size in bytes of shared key */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.c index 7483ebf6d..e8a2e2c6e 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.c @@ -8,388 +8,246 @@ #include #include #include "arith_backend.h" -#include "cbd.h" #include "cbmc.h" #include "debug.h" #include "fips202x4.h" -#include "ntt.h" #include "poly.h" -#include "reduce.h" +#include "sampling.h" #include "symmetric.h" #include "verify.h" -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3) -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a) -{ - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - uint8_t t[8] = {0}; - for (j = 0; 
j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_bound(t, 0, j, 0, 16))) - { - t[j] = scalar_compress_d4(a->coeffs[8 * i + j]); - } - - r[i * 4] = t[0] | (t[1] << 4); - r[i * 4 + 1] = t[2] | (t[3] << 4); - r[i * 4 + 2] = t[4] | (t[5] << 4); - r[i * 4 + 3] = t[6] | (t[7] << 4); - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a) -{ - unsigned j; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - for (j = 0; j < MLKEM_N / 4; j++) - __loop__(invariant(j <= MLKEM_N / 4)) - { - unsigned k; - uint16_t t[4]; - for (k = 0; k < 4; k++) - __loop__( - invariant(k <= 4) - invariant(forall(r, 0, k, t[r] < (1u << 10)))) - { - t[k] = scalar_compress_d10(a->coeffs[4 * j + k]); - } - - /* - * Make all implicit truncation explicit. No data is being - * truncated for the LHS's since each t[i] is 10-bit in size. - */ - r[5 * j + 0] = (t[0] >> 0) & 0xFF; - r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF); - r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF); - r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF); - r[5 * j + 4] = (t[3] >> 2); - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 2; i++) - __loop__( - invariant(i <= MLKEM_N / 2) - invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q))) - { - r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF); - r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF); - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); -} - -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d10(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]) +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16) +#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic) +#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce) +#define fqmul MLKEM_NAMESPACE(fqmul) +#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce) +#define basemul_cached MLKEM_NAMESPACE(basemul_cached) +#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q) +#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block) +#define ntt_layer MLKEM_NAMESPACE(ntt_layer) +#define invntt_layer MLKEM_NAMESPACE(invntt_layer) +/* End of static namespacing */ + +/************************************************* + * Name: cast_uint16_to_int16 + * + * Description: Cast uint16 value to int16 + * + * Returns: + * input x in 0 .. 32767: returns value unchanged + * input x in 32768 .. 65535: returns (x - 65536) + **************************************************/ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "conversion" +#endif +ALWAYS_INLINE +static INLINE int16_t cast_uint16_to_int16(uint16_t x) { - unsigned j; - for (j = 0; j < MLKEM_N / 4; j++) - __loop__( - invariant(j <= MLKEM_N / 4) - invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q))) - { - unsigned k; - uint16_t t[4]; - uint8_t const *base = &a[5 * j]; - - t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); - t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6)); - t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4)); - t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2)); - - for (k = 0; k < 4; k++) - __loop__( - invariant(k <= 4) - invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q))) - { - r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]); - } - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + /* + * PORTABILITY: This relies on uint16_t -> int16_t + * being implemented as the inverse of int16_t -> uint16_t, + * which is implementation-defined (C99 6.3.1.3 (3)) + * 
CBMC (correctly) fails to prove this conversion is OK, + * so we have to suppress that check here + */ + return (int16_t)x; } -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ - || MLKEM_K == 3) */ +#ifdef CBMC +#pragma CPROVER check pop +#endif -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a) +/************************************************* + * Name: montgomery_reduce_generic + * + * Description: Generic Montgomery reduction; given a 32-bit integer a, computes + * 16-bit integer congruent to a * R^-1 mod q, where R=2^16 + * + * Arguments: - int32_t a: input integer to be reduced + * + * Returns: integer congruent to a * R^-1 modulo q, with absolute value + * <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2 + * + **************************************************/ +ALWAYS_INLINE +static INLINE int16_t montgomery_reduce_generic(int32_t a) { - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */ + const uint32_t QINV = 62209; /* q^-1 mod 2^16 */ - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - uint8_t t[8] = {0}; - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_bound(t, 0, j, 0, 32))) - { - t[j] = scalar_compress_d5(a->coeffs[8 * i + j]); - } + /* Compute a*q^{-1} mod 2^16 in unsigned representatives */ + const uint16_t a_reduced = a & UINT16_MAX; + const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX; - /* - * Explicitly truncate to avoid warning about - * implicit truncation in CBMC, and use array indexing into - * r rather than pointer-arithmetic to simplify verification - */ - r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5)); - r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); - r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4)); 
- r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); - r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3)); - } -} + /* Lift to signed canonical representative mod 2^16. */ + const int16_t t = cast_uint16_to_int16(a_inverted); -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a) -{ - unsigned j; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + int32_t r = a - ((int32_t)t * MLKEM_Q); + /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */ - for (j = 0; j < MLKEM_N / 8; j++) - __loop__(invariant(j <= MLKEM_N / 8)) - { - unsigned k; - uint16_t t[8]; - for (k = 0; k < 8; k++) - __loop__( - invariant(k <= 8) - invariant(forall(r, 0, k, t[r] < (1u << 11)))) - { - t[k] = scalar_compress_d11(a->coeffs[8 * j + k]); - } + /* + * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, + * implementation-defined for negative left argument. Here, + * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) + */ + r = r >> 16; + /* Bounds: |r >> 16| <= ceil(|r| / 2^16) + * <= ceil(|a| / 2^16 + MLKEM_Q / 2) + * <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 + * + * (Note that |a >> n| = ceil(|a| / 2^16) for negative a) + */ - /* - * Make all implicit truncation explicit. No data is being - * truncated for the LHS's since each t[i] is 11-bit in size. 
- */ - r[11 * j + 0] = (t[0] >> 0) & 0xFF; - r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF); - r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF); - r[11 * j + 3] = (t[2] >> 2) & 0xFF; - r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF); - r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF); - r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF); - r[11 * j + 7] = (t[5] >> 1) & 0xFF; - r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF); - r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF); - r[11 * j + 10] = (t[7] >> 3); - } + return (int16_t)r; } -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]) +/************************************************* + * Name: montgomery_reduce + * + * Description: Montgomery reduction + * + * Arguments: - int32_t a: input integer to be reduced + * Must be smaller than 2 * 2^12 * 2^15 in absolute value. + * + * Returns: integer congruent to a * R^-1 modulo q, + * smaller than 2 * q in absolute value. + **************************************************/ +static INLINE int16_t montgomery_reduce(int32_t a) +__contract__( + requires(a > -(2 * UINT12_LIMIT * 32768)) + requires(a < (2 * UINT12_LIMIT * 32768)) + ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q) +) { - unsigned i; - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) - { - unsigned j; - uint8_t t[8]; - const unsigned offset = i * 5; - /* - * Explicitly truncate to avoid warning about - * implicit truncation in CBMC and unwind loop for ease - * of proof. 
- */ - - /* - * Decompress 5 8-bit bytes (so 40 bits) into - * 8 5-bit values stored in t[] - */ - t[0] = 0x1F & (a[offset + 0] >> 0); - t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3)); - t[2] = 0x1F & (a[offset + 1] >> 2); - t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1)); - t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4)); - t[5] = 0x1F & (a[offset + 3] >> 1); - t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2)); - t[7] = 0x1F & (a[offset + 4] >> 3); - - /* and copy to the correct slice in r[] */ - for (j = 0; j < 8; j++) - __loop__( - invariant(j <= 8 && i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) - { - r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]); - } - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + int16_t res; + debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768); + + res = montgomery_reduce_generic(a); + /* Bounds: + * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2 + * <= UINT12_LIMIT + (MLKEM_Q + 1) / 2 + * < 2 * MLKEM_Q */ + + debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q); + return res; } -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d11(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]) +#if !defined(MLKEM_USE_NATIVE_POLY_TOMONT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: fqmul + * + * Description: Montgomery multiplication modulo q=3329 + * + * Arguments: - int16_t a: first factor + * Can be any int16_t. + * - int16_t b: second factor. + * Must be signed canonical (abs value <(q+1)/2) + * + * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and + * smaller than q in absolute value. 
+ * + **************************************************/ +static INLINE int16_t fqmul(int16_t a, int16_t b) +__contract__( + requires(b > -HALF_Q) + requires(b < HALF_Q) + ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q) +) { - unsigned j; - for (j = 0; j < MLKEM_N / 8; j++) - __loop__( - invariant(j <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q))) - { - unsigned k; - uint16_t t[8]; - uint8_t const *base = &a[11 * j]; - t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); - t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5)); - t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) | - ((uint16_t)base[4] << 10)); - t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7)); - t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4)); - t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) | - ((uint16_t)base[8] << 9)); - t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6)); - t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3)); - - for (k = 0; k < 8; k++) - __loop__( - invariant(k <= 8) - invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q))) - { - r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]); - } - } + int16_t res; + debug_assert_abs_bound(&b, 1, HALF_Q); + + res = montgomery_reduce((int32_t)a * (int32_t)b); + /* Bounds: + * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2 + * < MLKEM_Q + */ - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + debug_assert_abs_bound(&res, 1, MLKEM_Q); + return res; } -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */ - -#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +#endif /* !defined(MLKEM_USE_NATIVE_POLY_TOMONT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_NTT) || \ + 
!defined(MLKEM_USE_NATIVE_INTT) */ + +#if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || !defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: barrett_reduce + * + * Description: Barrett reduction; given a 16-bit integer a, computes + * centered representative congruent to a mod q in + * {-(q-1)/2,...,(q-1)/2} + * + * Arguments: - int16_t a: input integer to be reduced + * + * Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. + **************************************************/ +static INLINE int16_t barrett_reduce(int16_t a) +__contract__( + ensures(return_value > -HALF_Q && return_value < HALF_Q) +) { - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_N / 2; i++) - __loop__(invariant(i <= MLKEM_N / 2)) - { - const uint16_t t0 = a->coeffs[2 * i]; - const uint16_t t1 = a->coeffs[2 * i + 1]; - /* - * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of - * significant data, so these can be packed into 24 bits or exactly - * 3 bytes, as follows. - */ - - /* Least significant bits 0 - 7 of t0. */ - r[3 * i + 0] = t0 & 0xFF; - - /* - * Most significant bits 8 - 11 of t0 become the least significant - * nibble of the second byte. The least significant 4 bits - * of t1 become the upper nibble of the second byte. - */ - r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0); + /* + * To divide by MLKEM_Q using Barrett multiplication, the "magic number" + * multiplier is round_to_nearest(2**26/MLKEM_Q) + */ + const int BPOWER = 26; + const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q; - /* Bits 4 - 11 of t1 become the third byte. 
*/ - r[3 * i + 2] = t1 >> 4; - } -} -#else /* MLKEM_USE_NATIVE_POLY_TOBYTES */ -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) -{ - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - poly_tobytes_native(r, a); -} -#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + /* + * Compute round_to_nearest(a/MLKEM_Q) using the multiplier + * above and shift by BPOWER places. + * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, + * implementation-defined for negative left argument. Here, + * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) + */ + const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER; -#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 2; i++) - __loop__( - invariant(i <= MLKEM_N / 2) - invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT))) - { - const uint8_t t0 = a[3 * i + 0]; - const uint8_t t1 = a[3 * i + 1]; - const uint8_t t2 = a[3 * i + 2]; - r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF); - r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4); - } + /* + * t is in -10 .. 
+10, so we need 32-bit math to + * evaluate t * MLKEM_Q and the subsequent subtraction + */ + int16_t res = (int16_t)(a - t * MLKEM_Q); - /* Note that the coefficients are not canonical */ - debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT); -} -#else /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -{ - poly_frombytes_native(r, a); + debug_assert_abs_bound(&res, 1, HALF_Q); + return res; } -#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ - -MLKEM_NATIVE_INTERNAL_API -void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +#endif /* !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || \ + !defined(MLKEM_USE_NATIVE_INTT) */ + +static void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], + int16_t b_cached) +__contract__( + requires(memory_no_alias(r, 2 * sizeof(int16_t))) + requires(memory_no_alias(a, 2 * sizeof(int16_t))) + requires(memory_no_alias(b, 2 * sizeof(int16_t))) + requires(array_bound(a, 0, 2, 0, UINT12_LIMIT)) + assigns(memory_slice(r, 2 * sizeof(int16_t))) + ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))) { - unsigned i; -#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8) -#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!" 
-#endif + int32_t t0, t1; + debug_assert_bound(a, 2, 0, UINT12_LIMIT); - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) - { - unsigned j; - for (j = 0; j < 8; j++) - __loop__( - invariant(i < MLKEM_N / 8 && j <= 8) - invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) - { - /* Prevent the compiler from recognizing this as a bit selection */ - uint8_t mask = value_barrier_u8(1u << j); - r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask); - } - } - debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); -} + t0 = (int32_t)a[1] * b_cached; + t0 += (int32_t)a[0] * b[0]; + t1 = (int32_t)a[0] * b[1]; + t1 += (int32_t)a[1] * b[0]; -MLKEM_NATIVE_INTERNAL_API -void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a) -{ - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + /* |ti| < 2 * q * 2^15 */ + r[0] = montgomery_reduce(t0); + r[1] = montgomery_reduce(t1); - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - msg[i] = 0; - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8)) - { - uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]); - msg[i] |= t << j; - } - } + debug_assert_abs_bound(r, 2, 2 * MLKEM_Q); } MLKEM_NATIVE_INTERNAL_API @@ -434,12 +292,46 @@ void poly_tomont(poly *r) MLKEM_NATIVE_INTERNAL_API void poly_tomont(poly *r) { - poly_tomont_native(r); + poly_tomont_native(r->coeffs); debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); } #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ #if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) +/************************************************************ + * Name: scalar_signed_to_unsigned_q + * + * Description: converts signed polynomial coefficient + * from signed (-3328 .. 3328) form to + * unsigned form (0 .. 3328). 
+ * + * Note: Cryptographic constant time implementation + * + * Examples: 0 -> 0 + * 1 -> 1 + * 3328 -> 3328 + * -1 -> 3328 + * -2 -> 3327 + * -3328 -> 1 + * + * Arguments: c: signed coefficient to be converted + ************************************************************/ +static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c) +__contract__( + requires(c > -MLKEM_Q && c < MLKEM_Q) + ensures(return_value >= 0 && return_value < MLKEM_Q) + ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q))) +{ + debug_assert_abs_bound(&c, 1, MLKEM_Q); + + /* Add Q if c is negative, but in constant time */ + c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c)); + + /* and therefore cast to uint16_t is safe. */ + debug_assert_bound(&c, 1, 0, MLKEM_Q); + return (uint16_t)c; +} + MLKEM_NATIVE_INTERNAL_API void poly_reduce(poly *r) { @@ -461,7 +353,7 @@ void poly_reduce(poly *r) MLKEM_NATIVE_INTERNAL_API void poly_reduce(poly *r) { - poly_reduce_native(r); + poly_reduce_native(r->coeffs); debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); } #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ @@ -520,13 +412,232 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a) MLKEM_NATIVE_INTERNAL_API void poly_mulcache_compute(poly_mulcache *x, const poly *a) { - poly_mulcache_compute_native(x, a); + poly_mulcache_compute_native(x->coeffs, a->coeffs); /* Omitting bounds assertion since native implementations may * decide not to use a mulcache. Note that the C backend implementation * of poly_basemul_montgomery_cached() does still include the check. */ } #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ +#if !defined(MLKEM_USE_NATIVE_NTT) +/* + * Computes a block CT butterflies with a fixed twiddle factor, + * using Montgomery multiplication. + * Parameters: + * - r: Pointer to base of polynomial (_not_ the base of butterfly block) + * - root: Twiddle factor to use for the butterfly. This must be in + * Montgomery form and signed canonical. 
+ * - start: Offset to the beginning of the butterfly block + * - len: Index difference between coefficients subject to a butterfly + * - bound: Ghost variable describing coefficient bound: Prior to `start`, + * coefficients must be bound by `bound + MLKEM_Q`. Post `start`, + * they must be bound by `bound`. + * When this function returns, output coefficients in the index range + * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`. + * Example: + * - start=8, len=4 + * This would compute the following four butterflies + * 8 -- 12 + * 9 -- 13 + * 10 -- 14 + * 11 -- 15 + * - start=4, len=2 + * This would compute the following two butterflies + * 4 -- 6 + * 5 -- 7 + */ +static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, + unsigned start, unsigned len, int bound) +__contract__( + requires(start < MLKEM_N) + requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N) + requires(0 <= bound && bound < INT16_MAX - MLKEM_Q) + requires(-HALF_Q < zeta && zeta < HALF_Q) + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(array_abs_bound(r, 0, start, bound + MLKEM_Q)) + requires(array_abs_bound(r, start, MLKEM_N, bound)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q)) + ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound))) +{ + /* `bound` is a ghost variable only needed in the CBMC specification */ + unsigned j; + ((void)bound); + for (j = start; j < start + len; j++) + __loop__( + invariant(start <= j && j <= start + len) + /* + * Coefficients are updated in strided pairs, so the bounds for the + * intermediate states alternate twice between the old and new bound + */ + invariant(array_abs_bound(r, 0, j, bound + MLKEM_Q)) + invariant(array_abs_bound(r, j, start + len, bound)) + invariant(array_abs_bound(r, start + len, j + len, bound + MLKEM_Q)) + invariant(array_abs_bound(r, j + len, MLKEM_N, bound))) + { + int16_t t; + t = fqmul(r[j + len], 
zeta); + r[j + len] = r[j] - t; + r[j] = r[j] + t; + } +} + +/* + *Compute one layer of forward NTT + * Parameters: + * - r: Pointer to base of polynomial + * - len: Stride of butterflies in this layer. + * - layer: Ghost variable indicating which layer is being applied. + * Must match `len` via `len == MLKEM_N >> layer`. + * Note: `len` could be dropped and computed in the function, but + * we are following the structure of the reference NTT from the + * official Kyber implementation here, merely adding `layer` as + * a ghost variable for the specifications. + */ +static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer) +__contract__( + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer)) + requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q))) +{ + unsigned start, k; + /* `layer` is a ghost variable only needed in the CBMC specification */ + ((void)layer); + /* Twiddle factors for layer n start at index 2^(layer-1) */ + k = MLKEM_N / (2 * len); + for (start = 0; start < MLKEM_N; start += 2 * len) + __loop__( + invariant(start < MLKEM_N + 2 * len) + invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N) + invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q)) + invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q))) + { + int16_t zeta = zetas[k++]; + ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q); + } +} + +/* + * Compute full forward NTT + * NOTE: This particular implementation satisfies a much tighter + * bound on the output coefficients (5*q) than the contractual one (8*q), + * but this is not needed in the calling code. Should we change the + * base multiplication strategy to require smaller NTT output bounds, + * the proof may need strengthening. 
+ */ + +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *p) +{ + unsigned len, layer; + int16_t *r; + debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); + r = p->coeffs; + + for (len = 128, layer = 1; len >= 2; len >>= 1, layer++) + __loop__( + invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer)) + invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))) + { + ntt_layer(r, len, layer); + } + + /* Check the stronger bound */ + debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); +} +#else /* MLKEM_USE_NATIVE_NTT */ + +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *p) +{ + debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); + ntt_native(p->coeffs); + debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); +} +#endif /* MLKEM_USE_NATIVE_NTT */ + +#if !defined(MLKEM_USE_NATIVE_INTT) + +/* Compute one layer of inverse NTT */ +static void invntt_layer(int16_t *r, unsigned len, unsigned layer) +__contract__( + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7) + requires(len == (1 << (8 - layer))) + requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) +{ + unsigned start, k; + /* `layer` is a ghost variable used only in the specification */ + ((void)layer); + k = MLKEM_N / len - 1; + for (start = 0; start < MLKEM_N; start += 2 * len) + __loop__( + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) + invariant(start <= MLKEM_N && k <= 127) + /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */ + invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len)) + { + unsigned j; + int16_t zeta = zetas[k--]; + for (j = start; j < start + len; j++) + __loop__( + invariant(start <= j && j <= start + len) + invariant(start <= MLKEM_N && k <= 127) + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) + { + int16_t t = r[j]; + r[j] = barrett_reduce(t + r[j + len]); + r[j + len] = r[j + len] - t; + r[j + len] = fqmul(r[j 
+ len], zeta); + } + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *p) +{ + /* + * Scale input polynomial to account for Montgomery factor + * and NTT twist. This also brings coefficients down to + * absolute value < MLKEM_Q. + */ + unsigned j, len, layer; + const int16_t f = 1441; + int16_t *r = p->coeffs; + + for (j = 0; j < MLKEM_N; j++) + __loop__( + invariant(j <= MLKEM_N) + invariant(array_abs_bound(r, 0, j, MLKEM_Q))) + { + r[j] = fqmul(r[j], f); + } + + /* Run the invNTT layers */ + for (len = 2, layer = 7; len <= 128; len <<= 1, layer--) + __loop__( + invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer))) + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) + { + invntt_layer(p->coeffs, len, layer); + } + + debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); +} +#else /* MLKEM_USE_NATIVE_INTT */ + +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *p) +{ + intt_native(p->coeffs); + debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); +} +#endif /* MLKEM_USE_NATIVE_INTT */ + #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ #define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.h index 6a14c785d..cb0d67c1a 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.h @@ -9,7 +9,7 @@ #include #include "cbmc.h" #include "common.h" -#include "reduce.h" +#include "debug.h" #include "verify.h" /* Absolute exclusive upper bound for the output of the inverse NTT */ @@ -18,6 +18,9 @@ /* Absolute exclusive upper bound for the output of the forward NTT */ #define NTT_BOUND (8 * MLKEM_Q) +#define zetas MLKEM_NAMESPACE(zetas) +extern const int16_t zetas[128]; + /* * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] @@ -38,520 +41,6 @@ typedef struct int16_t coeffs[MLKEM_N >> 1]; } poly_mulcache; -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1) -#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4) -#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5) -#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10) -#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11) -#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4) -#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5) -#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10) -#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11) -#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q) -/* End of static namespacing */ - -/************************************************************ - * Name: scalar_compress_d1 - * - * Description: Computes round(u * 2 / q) - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 1. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. 
- */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d1(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 2) - ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) ) -{ - uint32_t d0 = u << 1; - d0 *= 645083; - d0 += 1u << 30; - d0 >>= 31; - return d0; -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_compress_d4 - * - * Description: Computes round(u * 16 / q) % 16 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 4. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d4(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 16) - ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16)) -{ - uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */ - return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */ -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d4 - * - * Description: Computes round(u * q / 16) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 4. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d4(uint32_t u) -__contract__( - requires(0 <= u && u < 16) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 8) / 16; } - -/************************************************************ - * Name: scalar_compress_d5 - * - * Description: Computes round(u * 32 / q) % 32 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 5. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d5(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 32) - ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) ) -{ - uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */ - return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */ -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d5 - * - * Description: Computes round(u * q / 32) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 5. - * - * Arguments: - u: Unsigned canonical modulus modulo 32 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d5(uint32_t u) -__contract__( - requires(0 <= u && u < 32) - ensures(return_value <= MLKEM_Q - 1) -) { return ((u * MLKEM_Q) + 16) / 32; } - -/************************************************************ - * Name: scalar_compress_d10 - * - * Description: Computes round(u * 2**10 / q) % 2**10 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d10(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < (1u << 10)) - ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10))) -{ - uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^32 / MLKEM_Q) */ - d0 = (d0 + ((uint64_t)1u << 32)) >> 33; - return (d0 & 0x3FF); -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d10 - * - * Description: Computes round(u * q / 1024) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d10(uint32_t u) -__contract__( - requires(0 <= u && u < 1024) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 512) / 1024; } - -/************************************************************ - * Name: scalar_compress_d11 - * - * Description: Computes round(u * 2**11 / q) % 2**11 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 11. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d11(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < (1u << 11)) - ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11))) -{ - uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */ - d0 = (d0 + ((uint64_t)1u << 32)) >> 33; - return (d0 & 0x7FF); -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d11 - * - * Description: Computes round(u * q / 1024) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d11(uint32_t u) -__contract__( - requires(0 <= u && u < 2048) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 1024) / 2048; } - -/************************************************************ - * Name: scalar_signed_to_unsigned_q - * - * Description: converts signed polynomial coefficient - * from signed (-3328 .. 3328) form to - * unsigned form (0 .. 3328). - * - * Note: Cryptographic constant time implementation - * - * Examples: 0 -> 0 - * 1 -> 1 - * 3328 -> 3328 - * -1 -> 3328 - * -2 -> 3327 - * -3328 -> 1 - * - * Arguments: c: signed coefficient to be converted - ************************************************************/ -static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c) -__contract__( - requires(c > -MLKEM_Q && c < MLKEM_Q) - ensures(return_value >= 0 && return_value < MLKEM_Q) - ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q))) -{ - debug_assert_abs_bound(&c, 1, MLKEM_Q); - - /* Add Q if c is negative, but in constant time */ - c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c)); - - /* and therefore cast to uint16_t is safe. */ - debug_assert_bound(&c, 1, 0, MLKEM_Q); - return (uint16_t)c; -} - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \ - (MLKEM_K == 2 || MLKEM_K == 3) -#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4) -/************************************************* - * Name: poly_compress_d4 - * - * Description: Compression (4 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a); - -#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10) -/************************************************* - * Name: poly_compress_d10 - * - * Description: Compression (10 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a); - -#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4) -/************************************************* - * Name: poly_decompress_d4 - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]); - -#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10) -/************************************************* - * Name: poly_decompress_d10 - * - * Description: De-serialization and subsequent decompression (10 bits) of a - * polynomial; approximate inverse of poly_compress_d10 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d10(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]); -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ - || MLKEM_K == 3) */ - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 -#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5) -/************************************************* - * Name: poly_compress_d5 - * - * Description: Compression (5 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a); - -#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11) -/************************************************* - * Name: poly_compress_d11 - * - * Description: Compression (11 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a); - -#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5) -/************************************************* - * Name: poly_decompress_d5 - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]); - -#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11) -/************************************************* - * Name: poly_decompress_d11 - * - * Description: De-serialization and subsequent decompression (11 bits) of a - * polynomial; approximate inverse of poly_compress_d11 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d11(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]); -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \ - */ - -#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes) -/************************************************* - * Name: poly_tobytes - * - * Description: Serialization of a polynomial. - * Signed coefficients are converted to - * unsigned form before serialization. 
- * - * Arguments: INPUT: - * - a: const pointer to input polynomial, - * with each coefficient in the range [0,1,..,Q-1] - * OUTPUT - * - r: pointer to output byte array - * (of MLKEM_POLYBYTES bytes) - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYBYTES)) - requires(memory_no_alias(a, sizeof(poly))) - requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(r)) -); - - -#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes) -/************************************************* - * Name: poly_frombytes - * - * Description: De-serialization of a polynomial. - * - * Arguments: INPUT - * - a: pointer to input byte array - * (of MLKEM_POLYBYTES bytes) - * OUTPUT - * - r: pointer to output polynomial, with - * each coefficient unsigned and in the range - * 0 .. 4095 - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)) -); - - -#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg) -/************************************************* - * Name: poly_frommsg - * - * Description: Convert 32-byte message to polynomial - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *msg: pointer to input message - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) -__contract__( - requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(object_whole(r)) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) -); - -#define poly_tomsg 
MLKEM_NAMESPACE(poly_tomsg) -/************************************************* - * Name: poly_tomsg - * - * Description: Convert polynomial to 32-byte message - * - * Arguments: - uint8_t *msg: pointer to output message - * - const poly *r: pointer to input polynomial - * Coefficients must be unsigned canonical - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r) -__contract__( - requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(msg)) -); - #define poly_basemul_montgomery_cached \ MLKEM_NAMESPACE(poly_basemul_montgomery_cached) /************************************************* @@ -715,4 +204,56 @@ __contract__( assigns(object_whole(r)) ); +#define poly_ntt MLKEM_NAMESPACE(poly_ntt) +/************************************************* + * Name: poly_ntt + * + * Description: Computes negacyclic number-theoretic transform (NTT) of + * a polynomial in place. + * + * The input is assumed to be in normal order and + * coefficient-wise bound by MLKEM_Q in absolute value. + * + * The output polynomial is in bitreversed order, and + * coefficient-wise bound by NTT_BOUND in absolute value. + * + * (NOTE: Sometimes the input to the NTT is actually smaller, + * which gives better bounds.) 
+ * + * Arguments: - poly *p: pointer to in/output polynomial + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *r) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q)) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND)) +); + +#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont) +/************************************************* + * Name: poly_invntt_tomont + * + * Description: Computes inverse of negacyclic number-theoretic transform (NTT) + * of a polynomial in place; + * inputs assumed to be in bitreversed order, output in normal + * order + * + * The input is assumed to be in bitreversed order, and can + * have arbitrary coefficients in int16_t. + * + * The output polynomial is in normal order, and + * coefficient-wise bound by INVNTT_BOUND in absolute value. + * + * Arguments: - poly *r: pointer to in/output polynomial + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *r) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND)) +); + #endif /* POLY_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly_k.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly_k.c new file mode 100644 index 000000000..c2d330ea9 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly_k.c @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include "poly_k.h" +#include +#include +#include "arith_backend.h" +#include "compress.h" +#include "sampling.h" +#include "symmetric.h" + +#include "debug.h" + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g.
with varying security levels) + * within a single compilation unit. */ +#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1) +#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2) +/* End of static namespacing */ + +MLKEM_NATIVE_INTERNAL_API +void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU], + const polyvec *a) +{ + unsigned i; + debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_K; i++) + { + poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]); + } +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_decompress_du(polyvec *r, + const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU]) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU); + } + + debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a) +{ + unsigned i; + debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_K; i++) + { + poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]); + } +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES]) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES); + } + + debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_ntt(polyvec *r) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_ntt(&r->vec[i]); + } + + debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_invntt_tomont(polyvec *r) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_invntt_tomont(&r->vec[i]); + } + + debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND); +} + +#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, + 
const polyvec *b, + const polyvec_mulcache *b_cache) +{ + unsigned i; + poly t; + debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); + + poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]); + for (i = 1; i < MLKEM_K; i++) + { + poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i], + &b_cache->vec[i]); + poly_add(r, &t); + } + + /* + * This bound is true for the C implementation, but not needed + * in the higher level bounds reasoning. It is thus omitted + * from the spec to not unnecessarily constrain native + * implementations, but checked here nonetheless. + */ + debug_assert_abs_bound(r, MLKEM_N, MLKEM_K * 2 * MLKEM_Q); +} +#else /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, + const polyvec *b, + const polyvec_mulcache *b_cache) +{ + debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); + /* Omitting bounds assertion for cache since native implementations may + * decide not to use a mulcache. Note that the C backend implementation + * of poly_basemul_montgomery_cached() does still include the check.
*/ + polyvec_basemul_acc_montgomery_cached_native(r->coeffs, (const int16_t *)a, + (const int16_t *)b, + (const int16_t *)b_cache); +} +#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ + +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) +{ + polyvec_mulcache b_cache; + polyvec_mulcache_compute(&b_cache, b); + polyvec_basemul_acc_montgomery_cached(r, a, b, &b_cache); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_mulcache_compute(&x->vec[i], &a->vec[i]); + } +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_reduce(polyvec *r) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_reduce(&r->vec[i]); + } + + debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_add(polyvec *r, const polyvec *b) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_add(&r->vec[i], &b->vec[i]); + } +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_tomont(polyvec *r) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_tomont(&r->vec[i]); + } + + debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q); +} + + +/************************************************* + * Name: poly_cbd_eta1 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter MLKEM_ETA1. 
+ * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +static INLINE void poly_cbd_eta1(poly *r, + const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4]) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4)) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1)) +) +{ +#if MLKEM_ETA1 == 2 + poly_cbd2(r, buf); +#elif MLKEM_ETA1 == 3 + poly_cbd3(r, buf); +#else +#error "Invalid value of MLKEM_ETA1" +#endif +} + +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, + const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0, + uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) +{ + ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1]; + ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1]; + ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1]; + ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1]; + memcpy(extkey0, seed, MLKEM_SYMBYTES); + memcpy(extkey1, seed, MLKEM_SYMBYTES); + memcpy(extkey2, seed, MLKEM_SYMBYTES); + memcpy(extkey3, seed, MLKEM_SYMBYTES); + extkey0[MLKEM_SYMBYTES] = nonce0; + extkey1[MLKEM_SYMBYTES] = nonce1; + extkey2[MLKEM_SYMBYTES] = nonce2; + extkey3[MLKEM_SYMBYTES] = nonce3; + prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3); + poly_cbd_eta1(r0, buf0); + poly_cbd_eta1(r1, buf1); + poly_cbd_eta1(r2, buf2); + poly_cbd_eta1(r3, buf3); + + debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1); +} + +#if MLKEM_K == 2 || MLKEM_K == 4 
+/************************************************* + * Name: poly_cbd_eta2 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter MLKEM_ETA2. + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +static INLINE void poly_cbd_eta2(poly *r, + const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4)) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))) +{ +#if MLKEM_ETA2 == 2 + poly_cbd2(r, buf); +#else +#error "Invalid value of MLKEM_ETA2" +#endif +} + +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES], + uint8_t nonce) +{ + ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]; + ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1]; + + memcpy(extkey, seed, MLKEM_SYMBYTES); + extkey[MLKEM_SYMBYTES] = nonce; + prf_eta2(buf, extkey); + + poly_cbd_eta2(r, buf); + /* Bound guaranteed by poly_cbd_eta2() for parameter MLKEM_ETA2. */ + debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1); +} +#endif /* MLKEM_K == 2 || MLKEM_K == 4 */ + + +#if MLKEM_K == 2 +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3, + const uint8_t seed[MLKEM_SYMBYTES], + uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, + uint8_t nonce3) +{ + ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4]; + ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1]; + memcpy(extkey[0], seed, MLKEM_SYMBYTES); + memcpy(extkey[1], seed, MLKEM_SYMBYTES); + memcpy(extkey[2], seed, MLKEM_SYMBYTES); + memcpy(extkey[3], seed, MLKEM_SYMBYTES); + extkey[0][MLKEM_SYMBYTES] = nonce0; + extkey[1][MLKEM_SYMBYTES] = nonce1; + extkey[2][MLKEM_SYMBYTES] = nonce2; +
extkey[3][MLKEM_SYMBYTES] = nonce3; + + prf_eta1(buf1[0], extkey[0]); + prf_eta1(buf1[1], extkey[1]); + prf_eta2(buf2[0], extkey[2]); + prf_eta2(buf2[1], extkey[3]); + + poly_cbd_eta1(r0, buf1[0]); + poly_cbd_eta1(r1, buf1[1]); + poly_cbd_eta2(r2, buf2[0]); + poly_cbd_eta2(r3, buf2[1]); + + debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1); + debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1); +} +#endif /* MLKEM_K == 2 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly_k.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly_k.h new file mode 100644 index 000000000..0aea95912 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly_k.h @@ -0,0 +1,596 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef POLY_K_H +#define POLY_K_H + +#include +#include "common.h" +#include "compress.h" +#include "poly.h" + +#define polyvec MLKEM_NAMESPACE_K(polyvec) +typedef struct +{ + poly vec[MLKEM_K]; +} ALIGN polyvec; + +#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache) +typedef struct +{ + poly_mulcache vec[MLKEM_K]; +} polyvec_mulcache; + +#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du) +/************************************************* + * Name: poly_compress_du + * + * Description: Compression (du bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. 
+ **************************************************/ +static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], + const poly *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU)) + requires(memory_no_alias(a, sizeof(poly))) + requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU))) +{ +#if MLKEM_DU == 10 + poly_compress_d10(r, a); +#elif MLKEM_DU == 11 + poly_compress_d11(r, a); +#else +#error "Invalid value of MLKEM_DU" +#endif +} + +#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du) +/************************************************* + * Name: poly_decompress_du + * + * Description: De-serialization and subsequent decompression (du bits) of a + * polynomial; approximate inverse of poly_compress_du + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). 
+ * + **************************************************/ +static INLINE void poly_decompress_du( + poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) +{ +#if MLKEM_DU == 10 + poly_decompress_d10(r, a); +#elif MLKEM_DU == 11 + poly_decompress_d11(r, a); +#else +#error "Invalid value of MLKEM_DU" +#endif +} + +#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv) +/************************************************* + * Name: poly_compress_dv + * + * Description: Compression (dv bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. 
+ **************************************************/ +static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], + const poly *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV)) + requires(memory_no_alias(a, sizeof(poly))) + requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(r))) +{ +#if MLKEM_DV == 4 + poly_compress_d4(r, a); +#elif MLKEM_DV == 5 + poly_compress_d5(r, a); +#else +#error "Invalid value of MLKEM_DV" +#endif +} + + +#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv) +/************************************************* + * Name: poly_decompress_dv + * + * Description: De-serialization and subsequent decompression (dv bits) of a + * polynomial; approximate inverse of poly_compress + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +static INLINE void poly_decompress_dv( + poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(object_whole(r)) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) +{ +#if MLKEM_DV == 4 + poly_decompress_d4(r, a); +#elif MLKEM_DV == 5 + poly_decompress_d5(r, a); +#else +#error "Invalid value of MLKEM_DV" +#endif +} + +#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du) +/************************************************* + * Name: polyvec_compress_du + * + * Description: Compress and serialize vector of polynomials + * + * Arguments: - uint8_t *r: pointer to output byte array + * (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU) + * - const polyvec *a: pointer to input vector of polynomials. 
+ * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU], + const polyvec *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU)) + requires(memory_no_alias(a, sizeof(polyvec))) + requires(forall(k0, 0, MLKEM_K, + array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) + assigns(object_whole(r)) +); + +#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du) +/************************************************* + * Name: polyvec_decompress_du + * + * Description: De-serialize and decompress vector of polynomials; + * approximate inverse of polyvec_compress_du + * + * Arguments: - polyvec *r: pointer to output vector of polynomials. + * Output will have coefficients normalized to [0,..,q-1]. + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU) + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_decompress_du(polyvec *r, + const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU)) + requires(memory_no_alias(r, sizeof(polyvec))) + assigns(object_whole(r)) + ensures(forall(k0, 0, MLKEM_K, + array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) +); + +#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes) +/************************************************* + * Name: polyvec_tobytes + * + * Description: Serialize vector of polynomials + * + * Arguments: - uint8_t *r: pointer to output byte array + * (needs space for MLKEM_POLYVECBYTES) + * - const polyvec *a: pointer to input vector of polynomials + * Each polynomial must have coefficients in [0,..,q-1]. 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a) +__contract__( + requires(memory_no_alias(a, sizeof(polyvec))) + requires(memory_no_alias(r, MLKEM_POLYVECBYTES)) + requires(forall(k0, 0, MLKEM_K, + array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) + assigns(object_whole(r)) +); + +#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes) +/************************************************* + * Name: polyvec_frombytes + * + * Description: De-serialize vector of polynomials; + * inverse of polyvec_tobytes + * + * Arguments: - polyvec *r: pointer to output vector of polynomials. + * Output will have coefficients normalized in [0..4095]. + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYVECBYTES) + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES]) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + requires(memory_no_alias(a, MLKEM_POLYVECBYTES)) + assigns(object_whole(r)) + ensures(forall(k0, 0, MLKEM_K, + array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) +); + +#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt) +/************************************************* + * Name: polyvec_ntt + * + * Description: Apply forward NTT to all elements of a vector of polynomials. + * + * The input is assumed to be in normal order and + * coefficient-wise bound by MLKEM_Q in absolute value. + * + * The output polynomial is in bitreversed order, and + * coefficient-wise bound by NTT_BOUND in absolute value.
+ * + * Arguments: - polyvec *r: pointer to in/output vector of polynomials + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_ntt(polyvec *r) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + requires(forall(j, 0, MLKEM_K, + array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q))) + assigns(object_whole(r)) + ensures(forall(j, 0, MLKEM_K, + array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND))) +); + +#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont) +/************************************************* + * Name: polyvec_invntt_tomont + * + * Description: Apply inverse NTT to all elements of a vector of polynomials + * and multiply by Montgomery factor 2^16 + * + * The input is assumed to be in bitreversed order, and can + * have arbitrary coefficients in int16_t. + * + * The output polynomial is in normal order, and + * coefficient-wise bound by INVNTT_BOUND in absolute value. + * + * + * Arguments: - polyvec *r: pointer to in/output vector of polynomials + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_invntt_tomont(polyvec *r) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + assigns(object_whole(r)) + ensures(forall(j, 0, MLKEM_K, + array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, INVNTT_BOUND))) +); + +#define polyvec_basemul_acc_montgomery \ + MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery) +/************************************************* + * Name: polyvec_basemul_acc_montgomery + * + * Description: Multiply elements of a and b in NTT domain, accumulate into r, + * and multiply by 2^-16. 
+ * + * Arguments: - poly *r: pointer to output polynomial + * - const polyvec *a: pointer to first input vector of polynomials + * - const polyvec *b: pointer to second input vector of polynomials + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(a, sizeof(polyvec))) + requires(memory_no_alias(b, sizeof(polyvec))) + requires(forall(k1, 0, MLKEM_K, + array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) + assigns(memory_slice(r, sizeof(poly))) +); + + +#define polyvec_basemul_acc_montgomery_cached \ + MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached) +/************************************************* + * Name: polyvec_basemul_acc_montgomery_cached + * + * Description: Scalar product of two vectors of polynomials in NTT domain, + * using mulcache for second operand. + * + * Bounds: + * - Every coefficient of a is assumed to be in [0..4095] + * - No bounds guarantees for the coefficients in the result. + * + * Arguments: - poly *r: pointer to output polynomial + * - const polyvec *a: pointer to first input polynomial vector + * - const polyvec *b: pointer to second input polynomial vector + * - const polyvec_mulcache *b_cache: pointer to mulcache + * for second input polynomial vector. Can be computed + * via polyvec_mulcache_compute(). 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, + const polyvec *b, + const polyvec_mulcache *b_cache) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(a, sizeof(polyvec))) + requires(memory_no_alias(b, sizeof(polyvec))) + requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache))) + requires(forall(k1, 0, MLKEM_K, + array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) + assigns(memory_slice(r, sizeof(poly))) +); + +#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute) +/************************************************************ + * Name: polyvec_mulcache_compute + * + * Description: Computes the mulcache for a vector of polynomials in NTT domain + * + * The mulcache of a degree-2 polynomial b := b0 + b1*X + * in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when + * computing products of b in Fq[X]/(X^2-zeta). + * + * The mulcache of a polynomial in NTT domain -- which is + * a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta), + * for varying zeta, is the 128-tuple of mulcaches of those + * polynomials. + * + * The mulcache of a vector of polynomials is the vector + * of mulcaches of its entries. + * + * Arguments: - x: Pointer to mulcache to be populated + * - a: Pointer to input polynomial vector + ************************************************************/ +/* + * NOTE: The default C implementation of this function populates + * the mulcache with values in (-q,q), but this is not needed for the + * higher level safety proofs, and thus not part of the spec. 
+ */ +MLKEM_NATIVE_INTERNAL_API +void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a) +__contract__( + requires(memory_no_alias(x, sizeof(polyvec_mulcache))) + requires(memory_no_alias(a, sizeof(polyvec))) + assigns(object_whole(x)) +); + +#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce) +/************************************************* + * Name: polyvec_reduce + * + * Description: Applies Barrett reduction to each coefficient + * of each element of a vector of polynomials; + * for details of the Barrett reduction see comments in reduce.c + * + * Arguments: - polyvec *r: pointer to input/output polynomial + **************************************************/ +/* + * NOTE: The semantics of polyvec_reduce() is different in + * the reference implementation, which requires + * signed canonical output data. Unsigned canonical + * outputs are better suited to the only remaining + * use of poly_reduce() in the context of (de)serialization. + */ +MLKEM_NATIVE_INTERNAL_API +void polyvec_reduce(polyvec *r) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + assigns(object_whole(r)) + ensures(forall(k0, 0, MLKEM_K, + array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) +); + +#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add) +/************************************************* + * Name: polyvec_add + * + * Description: Add vectors of polynomials + * + * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be + * added to + * - const polyvec *b: pointer to second input vector of polynomials + * + * The coefficients of r and b must be so that the addition does + * not overflow. Otherwise, the behaviour of this function is undefined. + * + * The coefficients returned in *r are in int16_t which is sufficient + * to prove type-safety of calling units. Therefore, no stronger + * ensures clause is required on this function. 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_add(polyvec *r, const polyvec *b) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + requires(memory_no_alias(b, sizeof(polyvec))) + requires(forall(j0, 0, MLKEM_K, + forall(k0, 0, MLKEM_N, + (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX))) + requires(forall(j1, 0, MLKEM_K, + forall(k1, 0, MLKEM_N, + (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN))) + assigns(object_whole(r)) +); + +#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont) +/************************************************* + * Name: polyvec_tomont + * + * Description: Inplace conversion of all coefficients of a polynomial + * vector from normal domain to Montgomery domain + * + * Bounds: Output < q in absolute value. + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_tomont(polyvec *r) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + assigns(memory_slice(r, sizeof(polyvec))) + assigns(object_whole(r)) + ensures(forall(j, 0, MLKEM_K, + array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q))) +); + +#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x) +/************************************************* + * Name: poly_getnoise_eta1_4x + * + * Description: Batch sample four polynomials deterministically from a seed + * and nonces, with output polynomials close to centered binomial distribution + * with parameter MLKEM_ETA1. 
+ * + * Arguments: - poly *r{0,1,2,3}: pointer to output polynomial + * - const uint8_t *seed: pointer to input seed + * (of length MLKEM_SYMBYTES bytes) + * - uint8_t nonce{0,1,2,3}: one-byte input nonce + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, + const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0, + uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) +/* Depending on MLKEM_K, the pointers passed to this function belong + to the same objects, so we cannot use memory_no_alias for r0-r3. + + NOTE: Somehow it is important to use memory_no_alias() first in the + conjunctions defining each case. +*/ +#if MLKEM_K == 2 +__contract__( + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */ + (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) && + r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2))) + assigns(memory_slice(r0, sizeof(poly))) + assigns(memory_slice(r1, sizeof(poly))) + assigns(memory_slice(r2, sizeof(poly))) + assigns(memory_slice(r3, sizeof(poly))) + ensures( + array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); +); +#elif MLKEM_K == 4 +__contract__( + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + requires( /* Case B: r0, r1, r2, r3 consecutive */ + (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3)) + assigns(memory_slice(r0, sizeof(poly))) + assigns(memory_slice(r1, sizeof(poly))) + assigns(memory_slice(r2, sizeof(poly))) + assigns(memory_slice(r3, sizeof(poly))) + ensures( + array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 
1) + && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); +); +#elif MLKEM_K == 3 +__contract__( + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + requires( /* Case C: r0, r1, r2 consecutive */ + (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) && + r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0))) + assigns(memory_slice(r0, sizeof(poly))) + assigns(memory_slice(r1, sizeof(poly))) + assigns(memory_slice(r2, sizeof(poly))) + assigns(memory_slice(r3, sizeof(poly))) + ensures( + array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); +); +#endif /* MLKEM_K */ + +#if MLKEM_ETA1 == MLKEM_ETA2 +/* + * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024 + * where MLKEM_ETA2 = MLKEM_ETA1 = 2. + * For ml-kem-512, poly_getnoise_eta1122_4x is used instead. + */ +#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x +#endif /* MLKEM_ETA1 == MLKEM_ETA2 */ + +#if MLKEM_K == 2 || MLKEM_K == 4 +#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2) +/************************************************* + * Name: poly_getnoise_eta2 + * + * Description: Sample a polynomial deterministically from a seed and a nonce, + * with output polynomial close to centered binomial distribution + * with parameter MLKEM_ETA2 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *seed: pointer to input seed + * (of length MLKEM_SYMBYTES bytes) + * - uint8_t nonce: one-byte input nonce + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES], + uint8_t nonce) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + assigns(object_whole(r)) + ensures(array_abs_bound(r->coeffs, 0, 
MLKEM_N, MLKEM_ETA2 + 1)) +); +#endif /* MLKEM_K == 2 || MLKEM_K == 4 */ + +#if MLKEM_K == 2 +#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x) +/************************************************* + * Name: poly_getnoise_eta1122_4x + * + * Description: Batch sample four polynomials deterministically from a seed + * and a nonces, with output polynomials close to centered binomial + * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2 + * + * Arguments: - poly *r{0,1,2,3}: pointer to output polynomial + * - const uint8_t *seed: pointer to input seed + * (of length MLKEM_SYMBYTES bytes) + * - uint8_t nonce{0,1,2,3}: one-byte input nonce + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3, + const uint8_t seed[MLKEM_SYMBYTES], + uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, + uint8_t nonce3) +__contract__( + requires( /* r0, r1 consecutive, r2, r3 consecutive */ + (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) && + r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2))) + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3)) + ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1) + && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)); +); +#endif /* MLKEM_K == 2 */ + +#endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/polyvec.c deleted file mode 100644 index 50ea1c34a..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/polyvec.c +++ /dev/null @@ -1,330 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "polyvec.h" -#include -#include -#include 
"arith_backend.h" -#include "cbd.h" -#include "ntt.h" -#include "poly.h" -#include "symmetric.h" - -#include "debug.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1) -#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2) -/* End of static namespacing */ - -MLKEM_NATIVE_INTERNAL_API -void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU], - const polyvec *a) -{ - unsigned i; - debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_K; i++) - { - poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]); - } -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_decompress_du(polyvec *r, - const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU]) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU); - } - - debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a) -{ - unsigned i; - debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_K; i++) - { - poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]); - } -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES]) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES); - } - - debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_ntt(polyvec *r) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_ntt(&r->vec[i]); - } - - debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_invntt_tomont(polyvec *r) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_invntt_tomont(&r->vec[i]); - } - - 
debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND); -} - -#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, - const polyvec *b, - const polyvec_mulcache *b_cache) -{ - unsigned i; - poly t; - debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); - - poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]); - for (i = 1; i < MLKEM_K; i++) - { - poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i], - &b_cache->vec[i]); - poly_add(r, &t); - } - - /* - * This bound is true for the C implementation, but not needed - * in the higher level bounds reasoning. It is thus omitted - * them from the spec to not unnecessarily constrain native - * implementations, but checked here nonetheless. - */ - debug_assert_abs_bound(r, MLKEM_K, MLKEM_N * 2 * MLKEM_Q); -} -#else /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, - const polyvec *b, - const polyvec_mulcache *b_cache) -{ - debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); - /* Omitting bounds assertion for cache since native implementations may - * decide not to use a mulcache. Note that the C backend implementation - * of poly_basemul_montgomery_cached() does still include the check. 
*/ - polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache); -} -#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ - -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) -{ - polyvec_mulcache b_cache; - polyvec_mulcache_compute(&b_cache, b); - polyvec_basemul_acc_montgomery_cached(r, a, b, &b_cache); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_mulcache_compute(&x->vec[i], &a->vec[i]); - } -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_reduce(polyvec *r) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_reduce(&r->vec[i]); - } - - debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_add(polyvec *r, const polyvec *b) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_add(&r->vec[i], &b->vec[i]); - } -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_tomont(polyvec *r) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_tomont(&r->vec[i]); - } - - debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q); -} - - -/************************************************* - * Name: poly_cbd_eta1 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter MLKEM_ETA1. 
- * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -static INLINE void poly_cbd_eta1(poly *r, - const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4]) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4)) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1)) -) -{ -#if MLKEM_ETA1 == 2 - poly_cbd2(r, buf); -#elif MLKEM_ETA1 == 3 - poly_cbd3(r, buf); -#else -#error "Invalid value of MLKEM_ETA1" -#endif -} - -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0, - uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) -{ - ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1]; - ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1]; - ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1]; - ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1]; - memcpy(extkey0, seed, MLKEM_SYMBYTES); - memcpy(extkey1, seed, MLKEM_SYMBYTES); - memcpy(extkey2, seed, MLKEM_SYMBYTES); - memcpy(extkey3, seed, MLKEM_SYMBYTES); - extkey0[MLKEM_SYMBYTES] = nonce0; - extkey1[MLKEM_SYMBYTES] = nonce1; - extkey2[MLKEM_SYMBYTES] = nonce2; - extkey3[MLKEM_SYMBYTES] = nonce3; - prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3); - poly_cbd_eta1(r0, buf0); - poly_cbd_eta1(r1, buf1); - poly_cbd_eta1(r2, buf2); - poly_cbd_eta1(r3, buf3); - - debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1); -} - -#if MLKEM_K == 2 || MLKEM_K == 4 
-/************************************************* - * Name: poly_cbd_eta2 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter MLKEM_ETA2. - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -static INLINE void poly_cbd_eta2(poly *r, - const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4)) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))) -{ -#if MLKEM_ETA2 == 2 - poly_cbd2(r, buf); -#else -#error "Invalid value of MLKEM_ETA2" -#endif -} - -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES], - uint8_t nonce) -{ - ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]; - ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1]; - - memcpy(extkey, seed, MLKEM_SYMBYTES); - extkey[MLKEM_SYMBYTES] = nonce; - prf_eta2(buf, extkey); - - poly_cbd_eta2(r, buf); - - debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA1 + 1); -} -#endif /* MLKEM_K == 2 || MLKEM_K == 4 */ - - -#if MLKEM_K == 2 -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t seed[MLKEM_SYMBYTES], - uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, - uint8_t nonce3) -{ - ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4]; - ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1]; - memcpy(extkey[0], seed, MLKEM_SYMBYTES); - memcpy(extkey[1], seed, MLKEM_SYMBYTES); - memcpy(extkey[2], seed, MLKEM_SYMBYTES); - memcpy(extkey[3], seed, MLKEM_SYMBYTES); - extkey[0][MLKEM_SYMBYTES] = nonce0; - extkey[1][MLKEM_SYMBYTES] = nonce1; - extkey[2][MLKEM_SYMBYTES] = nonce2; - 
extkey[3][MLKEM_SYMBYTES] = nonce3; - - prf_eta1(buf1[0], extkey[0]); - prf_eta1(buf1[1], extkey[1]); - prf_eta2(buf2[0], extkey[2]); - prf_eta2(buf2[1], extkey[3]); - - poly_cbd_eta1(r0, buf1[0]); - poly_cbd_eta1(r1, buf1[1]); - poly_cbd_eta2(r2, buf2[0]); - poly_cbd_eta2(r3, buf2[1]); - - debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1); - debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1); -} -#endif /* MLKEM_K == 2 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/polyvec.h deleted file mode 100644 index 8be8579e0..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/polyvec.h +++ /dev/null @@ -1,595 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef POLYVEC_H -#define POLYVEC_H - -#include -#include "common.h" -#include "poly.h" - -#define polyvec MLKEM_NAMESPACE_K(polyvec) -typedef struct -{ - poly vec[MLKEM_K]; -} ALIGN polyvec; - -#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache) -typedef struct -{ - poly_mulcache vec[MLKEM_K]; -} polyvec_mulcache; - -#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du) -/************************************************* - * Name: poly_compress_du - * - * Description: Compression (du bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], - const poly *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU)) - requires(memory_no_alias(a, sizeof(poly))) - requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU))) -{ -#if MLKEM_DU == 10 - poly_compress_d10(r, a); -#elif MLKEM_DU == 11 - poly_compress_d11(r, a); -#else -#error "Invalid value of MLKEM_DU" -#endif -} - -#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du) -/************************************************* - * Name: poly_decompress_du - * - * Description: De-serialization and subsequent decompression (du bits) of a - * polynomial; approximate inverse of poly_compress_du - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -static INLINE void poly_decompress_du( - poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) -{ -#if MLKEM_DU == 10 - poly_decompress_d10(r, a); -#elif MLKEM_DU == 11 - poly_decompress_d11(r, a); -#else -#error "Invalid value of MLKEM_DU" -#endif -} - -#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv) -/************************************************* - * Name: poly_compress_dv - * - * Description: Compression (dv bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], - const poly *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV)) - requires(memory_no_alias(a, sizeof(poly))) - requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(r))) -{ -#if MLKEM_DV == 4 - poly_compress_d4(r, a); -#elif MLKEM_DV == 5 - poly_compress_d5(r, a); -#else -#error "Invalid value of MLKEM_DV" -#endif -} - - -#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv) -/************************************************* - * Name: poly_decompress_dv - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -static INLINE void poly_decompress_dv( - poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(object_whole(r)) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) -{ -#if MLKEM_DV == 4 - poly_decompress_d4(r, a); -#elif MLKEM_DV == 5 - poly_decompress_d5(r, a); -#else -#error "Invalid value of MLKEM_DV" -#endif -} - -#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du) -/************************************************* - * Name: polyvec_compress_du - * - * Description: Compress and serialize vector of polynomials - * - * Arguments: - uint8_t *r: pointer to output byte array - * (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU) - * - const polyvec *a: pointer to input vector of polynomials. 
- * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU], - const polyvec *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU)) - requires(memory_no_alias(a, sizeof(polyvec))) - requires(forall(k0, 0, MLKEM_K, - array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) - assigns(object_whole(r)) -); - -#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du) -/************************************************* - * Name: polyvec_decompress_du - * - * Description: De-serialize and decompress vector of polynomials; - * approximate inverse of polyvec_compress_du - * - * Arguments: - polyvec *r: pointer to output vector of polynomials. - * Output will have coefficients normalized to [0,..,q-1]. - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU) - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_decompress_du(polyvec *r, - const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU)) - requires(memory_no_alias(r, sizeof(polyvec))) - assigns(object_whole(r)) - ensures(forall(k0, 0, MLKEM_K, - array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) -); - -#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes) -/************************************************* - * Name: polyvec_tobytes - * - * Description: Serialize vector of polynomials - * - * Arguments: - uint8_t *r: pointer to output byte array - * (needs space for MLKEM_POLYVECBYTES) - * - const polyvec *a: pointer to input vector of polynomials - * Each polynomial must have coefficients in [0,..,q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a) -__contract__( - requires(memory_no_alias(a, sizeof(polyvec))) - requires(memory_no_alias(r, MLKEM_POLYVECBYTES)) - requires(forall(k0, 0, MLKEM_K, - array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) - assigns(object_whole(r)) -); - -#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes) -/************************************************* - * Name: polyvec_frombytes - * - * Description: De-serialize vector of polynomials; - * inverse of polyvec_tobytes - * - * Arguments: - const polyvec *a: pointer to output vector of polynomials - * (of length MLKEM_POLYVECBYTES). Output will have coefficients - * normalized in [0..4095]. - * - uint8_t *r: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES]) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - requires(memory_no_alias(a, MLKEM_POLYVECBYTES)) - assigns(object_whole(r)) - ensures(forall(k0, 0, MLKEM_K, - array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) -); - -#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt) -/************************************************* - * Name: polyvec_ntt - * - * Description: Apply forward NTT to all elements of a vector of polynomials. - * - * The input is assumed to be in normal order and - * coefficient-wise bound by MLKEM_Q in absolute value. - * - * The output polynomial is in bitreversed order, and - * coefficient-wise bound by NTT_BOUND in absolute value. 
- * - * Arguments: - polyvec *r: pointer to in/output vector of polynomials - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_ntt(polyvec *r) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - requires(forall(j, 0, MLKEM_K, - array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q))) - assigns(object_whole(r)) - ensures(forall(j, 0, MLKEM_K, - array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND))) -); - -#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont) -/************************************************* - * Name: polyvec_invntt_tomont - * - * Description: Apply inverse NTT to all elements of a vector of polynomials - * and multiply by Montgomery factor 2^16 - * - * The input is assumed to be in bitreversed order, and can - * have arbitrary coefficients in int16_t. - * - * The output polynomial is in normal order, and - * coefficient-wise bound by INVNTT_BOUND in absolute value. - * - * - * Arguments: - polyvec *r: pointer to in/output vector of polynomials - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_invntt_tomont(polyvec *r) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - assigns(object_whole(r)) - ensures(forall(j, 0, MLKEM_K, - array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, INVNTT_BOUND))) -); - -#define polyvec_basemul_acc_montgomery \ - MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery) -/************************************************* - * Name: polyvec_basemul_acc_montgomery - * - * Description: Multiply elements of a and b in NTT domain, accumulate into r, - * and multiply by 2^-16. 
- * - * Arguments: - poly *r: pointer to output polynomial - * - const polyvec *a: pointer to first input vector of polynomials - * - const polyvec *b: pointer to second input vector of polynomials - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(a, sizeof(polyvec))) - requires(memory_no_alias(b, sizeof(polyvec))) - requires(forall(k1, 0, MLKEM_K, - array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) - assigns(memory_slice(r, sizeof(poly))) -); - - -#define polyvec_basemul_acc_montgomery_cached \ - MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached) -/************************************************* - * Name: polyvec_basemul_acc_montgomery_cached - * - * Description: Scalar product of two vectors of polynomials in NTT domain, - * using mulcache for second operand. - * - * Bounds: - * - Every coefficient of a is assumed to be in [0..4095] - * - No bounds guarantees for the coefficients in the result. - * - * Arguments: - poly *r: pointer to output polynomial - * - const polyvec *a: pointer to first input polynomial vector - * - const polyvec *b: pointer to second input polynomial vector - * - const polyvec_mulcache *b_cache: pointer to mulcache - * for second input polynomial vector. Can be computed - * via polyvec_mulcache_compute(). 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, - const polyvec *b, - const polyvec_mulcache *b_cache) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(a, sizeof(polyvec))) - requires(memory_no_alias(b, sizeof(polyvec))) - requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache))) - requires(forall(k1, 0, MLKEM_K, - array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) - assigns(memory_slice(r, sizeof(poly))) -); - -#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute) -/************************************************************ - * Name: polyvec_mulcache_compute - * - * Description: Computes the mulcache for a vector of polynomials in NTT domain - * - * The mulcache of a degree-2 polynomial b := b0 + b1*X - * in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when - * computing products of b in Fq[X]/(X^2-zeta). - * - * The mulcache of a polynomial in NTT domain -- which is - * a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta), - * for varying zeta, is the 128-tuple of mulcaches of those - * polynomials. - * - * The mulcache of a vector of polynomials is the vector - * of mulcaches of its entries. - * - * Arguments: - x: Pointer to mulcache to be populated - * - a: Pointer to input polynomial vector - ************************************************************/ -/* - * NOTE: The default C implementation of this function populates - * the mulcache with values in (-q,q), but this is not needed for the - * higher level safety proofs, and thus not part of the spec. 
- */ -MLKEM_NATIVE_INTERNAL_API -void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a) -__contract__( - requires(memory_no_alias(x, sizeof(polyvec_mulcache))) - requires(memory_no_alias(a, sizeof(polyvec))) - assigns(object_whole(x)) -); - -#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce) -/************************************************* - * Name: polyvec_reduce - * - * Description: Applies Barrett reduction to each coefficient - * of each element of a vector of polynomials; - * for details of the Barrett reduction see comments in reduce.c - * - * Arguments: - polyvec *r: pointer to input/output polynomial - **************************************************/ -/* - * NOTE: The semantics of polyvec_reduce() is different in - * the reference implementation, which requires - * signed canonical output data. Unsigned canonical - * outputs are better suited to the only remaining - * use of poly_reduce() in the context of (de)serialization. - */ -MLKEM_NATIVE_INTERNAL_API -void polyvec_reduce(polyvec *r) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - assigns(object_whole(r)) - ensures(forall(k0, 0, MLKEM_K, - array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) -); - -#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add) -/************************************************* - * Name: polyvec_add - * - * Description: Add vectors of polynomials - * - * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be - * added to - * - const polyvec *b: pointer to second input vector of polynomials - * - * The coefficients of r and b must be so that the addition does - * not overflow. Otherwise, the behaviour of this function is undefined. - * - * The coefficients returned in *r are in int16_t which is sufficient - * to prove type-safety of calling units. Therefore, no stronger - * ensures clause is required on this function. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_add(polyvec *r, const polyvec *b) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - requires(memory_no_alias(b, sizeof(polyvec))) - requires(forall(j0, 0, MLKEM_K, - forall(k0, 0, MLKEM_N, - (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX))) - requires(forall(j1, 0, MLKEM_K, - forall(k1, 0, MLKEM_N, - (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN))) - assigns(object_whole(r)) -); - -#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont) -/************************************************* - * Name: polyvec_tomont - * - * Description: Inplace conversion of all coefficients of a polynomial - * vector from normal domain to Montgomery domain - * - * Bounds: Output < q in absolute value. - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_tomont(polyvec *r) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - assigns(memory_slice(r, sizeof(polyvec))) - assigns(object_whole(r)) - ensures(forall(j, 0, MLKEM_K, - array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q))) -); - -#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x) -/************************************************* - * Name: poly_getnoise_eta1_4x - * - * Description: Batch sample four polynomials deterministically from a seed - * and nonces, with output polynomials close to centered binomial distribution - * with parameter MLKEM_ETA1. 
- * - * Arguments: - poly *r{0,1,2,3}: pointer to output polynomial - * - const uint8_t *seed: pointer to input seed - * (of length MLKEM_SYMBYTES bytes) - * - uint8_t nonce{0,1,2,3}: one-byte input nonce - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0, - uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) -/* Depending on MLKEM_K, the pointers passed to this function belong - to the same objects, so we cannot use memory_no_alias for r0-r3. - - NOTE: Somehow it is important to use memory_no_alias() first in the - conjunctions defining each case. -*/ -#if MLKEM_K == 2 -__contract__( - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */ - (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) && - r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2))) - assigns(memory_slice(r0, sizeof(poly))) - assigns(memory_slice(r1, sizeof(poly))) - assigns(memory_slice(r2, sizeof(poly))) - assigns(memory_slice(r3, sizeof(poly))) - ensures( - array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); -); -#elif MLKEM_K == 4 -__contract__( - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - requires( /* Case B: r0, r1, r2, r3 consecutive */ - (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3)) - assigns(memory_slice(r0, sizeof(poly))) - assigns(memory_slice(r1, sizeof(poly))) - assigns(memory_slice(r2, sizeof(poly))) - assigns(memory_slice(r3, sizeof(poly))) - ensures( - array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 
1) - && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); -); -#elif MLKEM_K == 3 -__contract__( - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - requires( /* Case C: r0, r1, r2 consecutive */ - (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) && - r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0))) - assigns(memory_slice(r0, sizeof(poly))) - assigns(memory_slice(r1, sizeof(poly))) - assigns(memory_slice(r2, sizeof(poly))) - assigns(memory_slice(r3, sizeof(poly))) - ensures( - array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); -); -#endif /* MLKEM_K */ - -#if MLKEM_ETA1 == MLKEM_ETA2 -/* - * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024 - * where MLKEM_ETA2 = MLKEM_ETA1 = 2. - * For ml-kem-512, poly_getnoise_eta1122_4x is used instead. - */ -#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x -#endif /* MLKEM_ETA1 == MLKEM_ETA2 */ - -#if MLKEM_K == 2 || MLKEM_K == 4 -#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2) -/************************************************* - * Name: poly_getnoise_eta2 - * - * Description: Sample a polynomial deterministically from a seed and a nonce, - * with output polynomial close to centered binomial distribution - * with parameter MLKEM_ETA2 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *seed: pointer to input seed - * (of length MLKEM_SYMBYTES bytes) - * - uint8_t nonce: one-byte input nonce - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES], - uint8_t nonce) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - assigns(object_whole(r)) - ensures(array_abs_bound(r->coeffs, 0, 
MLKEM_N, MLKEM_ETA2 + 1)) -); -#endif /* MLKEM_K == 2 || MLKEM_K == 4 */ - -#if MLKEM_K == 2 -#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x) -/************************************************* - * Name: poly_getnoise_eta1122_4x - * - * Description: Batch sample four polynomials deterministically from a seed - * and a nonces, with output polynomials close to centered binomial - * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2 - * - * Arguments: - poly *r{0,1,2,3}: pointer to output polynomial - * - const uint8_t *seed: pointer to input seed - * (of length MLKEM_SYMBYTES bytes) - * - uint8_t nonce{0,1,2,3}: one-byte input nonce - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t seed[MLKEM_SYMBYTES], - uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, - uint8_t nonce3) -__contract__( - requires( /* r0, r1 consecutive, r2, r3 consecutive */ - (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) && - r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2))) - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3)) - ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1) - && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)); -); -#endif /* MLKEM_K == 2 */ - -#endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/reduce.h deleted file mode 100644 index b432a4201..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/reduce.h +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef REDUCE_H -#define REDUCE_H - -#include -#include 
"cbmc.h" -#include "common.h" -#include "debug.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16) -#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic) -#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce) -#define fqmul MLKEM_NAMESPACE(fqmul) -#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce) -/* End of static namespacing */ - -#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */ - -/************************************************* - * Name: cast_uint16_to_int16 - * - * Description: Cast uint16 value to int16 - * - * Returns: - * input x in 0 .. 32767: returns value unchanged - * input x in 32768 .. 65535: returns (x - 65536) - **************************************************/ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "conversion" -#endif -ALWAYS_INLINE -static INLINE int16_t cast_uint16_to_int16(uint16_t x) -{ - /* - * PORTABILITY: This relies on uint16_t -> int16_t - * being implemented as the inverse of int16_t -> uint16_t, - * which is implementation-defined (C99 6.3.1.3 (3)) - * CBMC (correctly) fails to prove this conversion is OK, - * so we have to suppress that check here - */ - return (int16_t)x; -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************* - * Name: montgomery_reduce_generic - * - * Description: Generic Montgomery reduction; given a 32-bit integer a, computes - * 16-bit integer congruent to a * R^-1 mod q, where R=2^16 - * - * Arguments: - int32_t a: input integer to be reduced - * - * Returns: integer congruent to a * R^-1 modulo q, with absolute value - * <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2 - * - **************************************************/ -ALWAYS_INLINE -static INLINE int16_t montgomery_reduce_generic(int32_t a) -{ - /* QINV == 
-3327 converted to uint16_t == -3327 + 65536 == 62209 */ - const uint32_t QINV = 62209; /* q^-1 mod 2^16 */ - - /* Compute a*q^{-1} mod 2^16 in unsigned representatives */ - const uint16_t a_reduced = a & UINT16_MAX; - const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX; - - /* Lift to signed canonical representative mod 2^16. */ - const int16_t t = cast_uint16_to_int16(a_inverted); - - int32_t r = a - ((int32_t)t * MLKEM_Q); - /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */ - - /* - * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, - * implementation-defined for negative left argument. Here, - * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) - */ - r = r >> 16; - /* Bounds: |r >> 16| <= ceil(|r| / 2^16) - * <= ceil(|a| / 2^16 + MLKEM_Q / 2) - * <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 - * - * (Note that |a >> n| = ceil(|a| / 2^16) for negative a) - */ - - return (int16_t)r; -} - -/************************************************* - * Name: montgomery_reduce - * - * Description: Montgomery reduction - * - * Arguments: - int32_t a: input integer to be reduced - * Must be smaller than 2 * 2^12 * 2^15 in absolute value. - * - * Returns: integer congruent to a * R^-1 modulo q, - * smaller than 2 * q in absolute value. 
- **************************************************/ -static INLINE int16_t montgomery_reduce(int32_t a) -__contract__( - requires(a > -(2 * UINT12_LIMIT * 32768)) - requires(a < (2 * UINT12_LIMIT * 32768)) - ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q) -) -{ - int16_t res; - debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768); - - res = montgomery_reduce_generic(a); - /* Bounds: - * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2 - * <= UINT12_LIMIT + (MLKEM_Q + 1) / 2 - * < 2 * MLKEM_Q */ - - debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q); - return res; -} - -/************************************************* - * Name: fqmul - * - * Description: Montgomery multiplication modulo q=3329 - * - * Arguments: - int16_t a: first factor - * Can be any int16_t. - * - int16_t b: second factor. - * Must be signed canonical (abs value <(q+1)/2) - * - * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and - * smaller than q in absolute value. 
- * - **************************************************/ -static INLINE int16_t fqmul(int16_t a, int16_t b) -__contract__( - requires(b > -HALF_Q) - requires(b < HALF_Q) - ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q) -) -{ - int16_t res; - debug_assert_abs_bound(&b, 1, HALF_Q); - - res = montgomery_reduce((int32_t)a * (int32_t)b); - /* Bounds: - * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2 - * < MLKEM_Q - */ - - debug_assert_abs_bound(&res, 1, MLKEM_Q); - return res; -} - -/************************************************* - * Name: barrett_reduce - * - * Description: Barrett reduction; given a 16-bit integer a, computes - * centered representative congruent to a mod q in - * {-(q-1)/2,...,(q-1)/2} - * - * Arguments: - int16_t a: input integer to be reduced - * - * Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. - **************************************************/ -static INLINE int16_t barrett_reduce(int16_t a) -__contract__( - ensures(return_value > -HALF_Q && return_value < HALF_Q) -) -{ - /* - * To divide by MLKEM_Q using Barrett multiplication, the "magic number" - * multiplier is round_to_nearest(2**26/MLKEM_Q) - */ - const int BPOWER = 26; - const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q; - - /* - * Compute round_to_nearest(a/MLKEM_Q) using the multiplier - * above and shift by BPOWER places. - * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, - * implementation-defined for negative left argument. Here, - * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) - */ - const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER; - - /* - * t is in -10 .. 
+10, so we need 32-bit math to - * evaluate t * MLKEM_Q and the subsequent subtraction - */ - int16_t res = (int16_t)(a - t * MLKEM_Q); - - debug_assert_abs_bound(&res, 1, HALF_Q); - return res; -} - -#endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/rej_uniform.c deleted file mode 100644 index cbbe4407f..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/rej_uniform.c +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) - -#include "arith_backend.h" -#include "debug.h" -#include "fips202.h" -#include "fips202x4.h" -#include "rej_uniform.h" -#include "symmetric.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define rej_uniform MLKEM_NAMESPACE(rej_uniform) -#define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar) -/* End of static namespacing */ - -static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target, - unsigned int offset, const uint8_t *buf, - unsigned int buflen) -__contract__( - requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0) - requires(memory_no_alias(r, sizeof(int16_t) * target)) - requires(memory_no_alias(buf, buflen)) - requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * target)) - ensures(offset <= return_value && return_value <= target) - ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q)) -) -{ - unsigned int ctr, pos; - uint16_t val0, val1; - - debug_assert_bound(r, offset, 0, MLKEM_Q); - - ctr = offset; - pos = 0; - /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */ - while (ctr < target && pos + 3 <= buflen) - __loop__( - 
invariant(offset <= ctr && ctr <= target && pos <= buflen) - invariant(ctr > 0 ==> array_bound(r, 0, ctr, 0, MLKEM_Q))) - { - val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; - val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; - pos += 3; - - if (val0 < MLKEM_Q) - { - r[ctr++] = val0; - } - if (ctr < target && val1 < MLKEM_Q) - { - r[ctr++] = val1; - } - } - - debug_assert_bound(r, ctr, 0, MLKEM_Q); - return ctr; -} - -#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM) -/************************************************* - * Name: rej_uniform - * - * Description: Run rejection sampling on uniform random bytes to generate - * uniform random integers mod q - * - * Arguments: - int16_t *r: pointer to output buffer - * - unsigned int target: requested number of 16-bit integers - * (uniform mod q). - * Must be <= 4096. - * - unsigned int offset: number of 16-bit integers that have - * already been sampled. - * Must be <= target. - * - const uint8_t *buf: pointer to input buffer - * (assumed to be uniform random bytes) - * - unsigned int buflen: length of input buffer in bytes - * Must be <= 4096. - * Must be a multiple of 3. - * - * Note: Strictly speaking, only a few values of buflen near UINT_MAX need - * excluding. The limit of 4096 is somewhat arbitary but sufficient for all - * uses of this function. Similarly, the actual limit for target is UINT_MAX/2. - * - * Returns the new offset of sampled 16-bit integers, at most target, - * and at least the initial offset. - * If the new offset is strictly less than len, all of the input buffers - * is guaranteed to have been consumed. If it is equal to len, no information - * is provided on how many bytes of the input buffer have been consumed. - **************************************************/ - -/* - * NOTE: The signature differs from the Kyber reference implementation - * in that it adds the offset and always expects the base of the target - * buffer. 
This avoids shifting the buffer base in the caller, which appears - * tricky to reason about. - */ -static unsigned int rej_uniform(int16_t *r, unsigned int target, - unsigned int offset, const uint8_t *buf, - unsigned int buflen) -__contract__( - requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0) - requires(memory_no_alias(r, sizeof(int16_t) * target)) - requires(memory_no_alias(buf, buflen)) - requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * target)) - ensures(offset <= return_value && return_value <= target) - ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q)) -) -{ - return rej_uniform_scalar(r, target, offset, buf, buflen); -} -#else /* MLKEM_USE_NATIVE_REJ_UNIFORM */ -static unsigned int rej_uniform(int16_t *r, unsigned int target, - unsigned int offset, const uint8_t *buf, - unsigned int buflen) -{ - int ret; - - /* Sample from large buffer with full lane as much as possible. 
*/ - ret = rej_uniform_native(r + offset, target - offset, buf, buflen); - if (ret != -1) - { - unsigned res = offset + (unsigned)ret; - debug_assert_bound(r, res, 0, MLKEM_Q); - return res; - } - - return rej_uniform_scalar(r, target, offset, buf, buflen); -} -#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */ - -#ifndef MLKEM_GEN_MATRIX_NBLOCKS -#define MLKEM_GEN_MATRIX_NBLOCKS \ - ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE) -#endif - -MLKEM_NATIVE_INTERNAL_API -void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4]) -{ - /* Temporary buffers for XOF output before rejection sampling */ - uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - - /* Tracks the number of coefficients we have already sampled */ - unsigned int ctr[KECCAK_WAY]; - xof_x4_ctx statex; - unsigned int buflen; - - shake128x4_inc_init(&statex); - - /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */ - xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3], - MLKEM_SYMBYTES + 2); - - /* - * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS. - * This should generate the matrix entries with high probability. - */ - xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS, - &statex); - buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE; - ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen); - ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen); - ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen); - ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen); - - /* - * So long as not all matrix entries have been generated, squeeze - * one more block a time until we're done. 
- */ - buflen = XOF_RATE; - while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N || - ctr[3] < MLKEM_N) - __loop__( - assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0), - object_whole(buf1), object_whole(buf2), object_whole(buf3)) - invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N) - invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N) - invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q)) - invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q)) - invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q)) - invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q))) - { - xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex); - ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen); - ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen); - ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen); - ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen); - } - - xof_x4_release(&statex); -} - -MLKEM_NATIVE_INTERNAL_API -void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) -{ - xof_ctx state; - uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - unsigned int ctr, buflen; - - shake128_inc_init(&state); - - xof_absorb(&state, seed, MLKEM_SYMBYTES + 2); - - /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS. - */ - /* This should generate the matrix entry with high probability. 
*/ - xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state); - buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE; - ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen); - - /* Squeeze + sample one more block a time until we're done */ - buflen = XOF_RATE; - while (ctr < MLKEM_N) - __loop__( - assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf)) - invariant(ctr <= MLKEM_N) - invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q))) - { - xof_squeezeblocks(buf, 1, &state); - ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen); - } - - xof_release(&state); -} - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform) -int empty_cu_rej_uniform; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/rej_uniform.h deleted file mode 100644 index 801287259..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/rej_uniform.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef REJ_UNIFORM_H -#define REJ_UNIFORM_H - -#include -#include -#include "cbmc.h" -#include "common.h" -#include "poly.h" - -#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4) -/************************************************* - * Name: poly_rej_uniform_x4 - * - * Description: Generate four polynomials using rejection sampling - * on (pseudo-)uniformly random bytes sampled from a seed. - * - * Arguments: - poly *vec: Pointer to an array of 4 polynomials - * to be sampled. - * - uint8_t *seed[4]: Pointer to array of four pointers - * pointing to the seed buffers of size - * MLKEM_SYMBYTES + 2 each. 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4]) -__contract__( - requires(memory_no_alias(vec, sizeof(poly) * 4)) - requires(memory_no_alias(seed, sizeof(uint8_t*) * 4)) - requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2)) - requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2)) - requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2)) - requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2)) - assigns(memory_slice(vec, sizeof(poly) * 4)) - ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q))); - -#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform) -/************************************************* - * Name: poly_rej_uniform - * - * Description: Generate polynomial using rejection sampling - * on (pseudo-)uniformly random bytes sampled from a seed. - * - * Arguments: - poly *vec: Pointer to polynomial to be sampled. - * - uint8_t *seed: Pointer to seed buffer of size - * MLKEM_SYMBYTES + 2 each. 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) -__contract__( - requires(memory_no_alias(entry, sizeof(poly))) - requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2)) - assigns(memory_slice(entry, sizeof(poly))) - ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q))); - -#endif /* REJ_UNIFORM_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/sampling.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/sampling.c new file mode 100644 index 000000000..98cbdcb74 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/sampling.c @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include "common.h" +#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) + +#include "arith_backend.h" +#include "debug.h" +#include "fips202.h" +#include "fips202x4.h" +#include "sampling.h" +#include "symmetric.h" + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define rej_uniform MLKEM_NAMESPACE(rej_uniform) +#define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar) +#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) +#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) +/* End of static namespacing */ + +static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target, + unsigned int offset, const uint8_t *buf, + unsigned int buflen) +__contract__( + requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0) + requires(memory_no_alias(r, sizeof(int16_t) * target)) + requires(memory_no_alias(buf, buflen)) + requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * target)) + ensures(offset <= return_value && return_value <= target) + ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q)) +) +{ + unsigned int ctr, pos; + uint16_t val0, val1; + + debug_assert_bound(r, offset, 0, MLKEM_Q); + + ctr = offset; + pos = 0; + /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */ + while (ctr < target && pos + 3 <= buflen) + __loop__( + invariant(offset <= ctr && ctr <= target && pos <= buflen) + invariant(ctr > 0 ==> array_bound(r, 0, ctr, 0, MLKEM_Q))) + { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; + pos += 3; + + if (val0 < MLKEM_Q) + { + r[ctr++] = val0; + } + if (ctr < target && val1 < MLKEM_Q) + { + r[ctr++] = val1; + } + } + + debug_assert_bound(r, ctr, 0, MLKEM_Q); + return ctr; +} + +#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM) +/************************************************* + * Name: rej_uniform + * + * Description: Run rejection sampling on uniform random bytes to generate + * uniform random integers mod q + * + * Arguments: - int16_t *r: pointer to output buffer + * - unsigned int target: requested number of 16-bit integers + * (uniform mod q). + * Must be <= 4096. 
+ * - unsigned int offset: number of 16-bit integers that have + * already been sampled. + * Must be <= target. + * - const uint8_t *buf: pointer to input buffer + * (assumed to be uniform random bytes) + * - unsigned int buflen: length of input buffer in bytes + * Must be <= 4096. + * Must be a multiple of 3. + * + * Note: Strictly speaking, only a few values of buflen near UINT_MAX need + * excluding. The limit of 4096 is somewhat arbitary but sufficient for all + * uses of this function. Similarly, the actual limit for target is UINT_MAX/2. + * + * Returns the new offset of sampled 16-bit integers, at most target, + * and at least the initial offset. + * If the new offset is strictly less than len, all of the input buffers + * is guaranteed to have been consumed. If it is equal to len, no information + * is provided on how many bytes of the input buffer have been consumed. + **************************************************/ + +/* + * NOTE: The signature differs from the Kyber reference implementation + * in that it adds the offset and always expects the base of the target + * buffer. This avoids shifting the buffer base in the caller, which appears + * tricky to reason about. 
+ */ +static unsigned int rej_uniform(int16_t *r, unsigned int target, + unsigned int offset, const uint8_t *buf, + unsigned int buflen) +__contract__( + requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0) + requires(memory_no_alias(r, sizeof(int16_t) * target)) + requires(memory_no_alias(buf, buflen)) + requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * target)) + ensures(offset <= return_value && return_value <= target) + ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q)) +) +{ + return rej_uniform_scalar(r, target, offset, buf, buflen); +} +#else /* MLKEM_USE_NATIVE_REJ_UNIFORM */ +static unsigned int rej_uniform(int16_t *r, unsigned int target, + unsigned int offset, const uint8_t *buf, + unsigned int buflen) +{ + int ret; + + /* Sample from large buffer with full lane as much as possible. */ + ret = rej_uniform_native(r + offset, target - offset, buf, buflen); + if (ret != -1) + { + unsigned res = offset + (unsigned)ret; + debug_assert_bound(r, res, 0, MLKEM_Q); + return res; + } + + return rej_uniform_scalar(r, target, offset, buf, buflen); +} +#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */ + +#ifndef MLKEM_GEN_MATRIX_NBLOCKS +#define MLKEM_GEN_MATRIX_NBLOCKS \ + ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE) +#endif + +MLKEM_NATIVE_INTERNAL_API +void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4]) +{ + /* Temporary buffers for XOF output before rejection sampling */ + uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + + /* Tracks the number of coefficients we have already sampled */ + unsigned int ctr[KECCAK_WAY]; + xof_x4_ctx statex; + unsigned int buflen; + + shake128x4_inc_init(&statex); + + /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */ + 
xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3], + MLKEM_SYMBYTES + 2); + + /* + * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS. + * This should generate the matrix entries with high probability. + */ + xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS, + &statex); + buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE; + ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen); + ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen); + ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen); + ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen); + + /* + * So long as not all matrix entries have been generated, squeeze + * one more block a time until we're done. + */ + buflen = XOF_RATE; + while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N || + ctr[3] < MLKEM_N) + __loop__( + assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0), + object_whole(buf1), object_whole(buf2), object_whole(buf3)) + invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N) + invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N) + invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q)) + invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q)) + invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q)) + invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q))) + { + xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex); + ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen); + ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen); + ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen); + ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen); + } + + xof_x4_release(&statex); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) +{ + xof_ctx state; + uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + unsigned int ctr, 
buflen; + + shake128_inc_init(&state); + + xof_absorb(&state, seed, MLKEM_SYMBYTES + 2); + + /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS. + */ + /* This should generate the matrix entry with high probability. */ + xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state); + buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE; + ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen); + + /* Squeeze + sample one more block a time until we're done */ + buflen = XOF_RATE; + while (ctr < MLKEM_N) + __loop__( + assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf)) + invariant(ctr <= MLKEM_N) + invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q))) + { + xof_squeezeblocks(buf, 1, &state); + ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen); + } + + xof_release(&state); +} + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) +#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) +/* End of static namespacing */ + +/************************************************* + * Name: load32_littleendian + * + * Description: load 4 bytes into a 32-bit integer + * in little-endian order + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns 32-bit unsigned integer loaded from x + **************************************************/ +static uint32_t load32_littleendian(const uint8_t x[4]) +{ + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + r |= (uint32_t)x[3] << 24; + return r; +} + +MLKEM_NATIVE_INTERNAL_API +void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3))) + { + unsigned j; + uint32_t t = load32_littleendian(buf + 4 * i); + uint32_t d = t & 0x55555555; + d += (t >> 1) & 0x55555555; + + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3))) + { + const int16_t a = (d >> (4 * j + 0)) & 0x3; + const int16_t b = (d >> (4 * j + 2)) & 0x3; + r->coeffs[8 * i + j] = a - b; + } + } +} + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 +/************************************************* + * Name: load24_littleendian + * + * Description: load 3 bytes into a 32-bit integer + * in little-endian order. 
+ * This function is only needed for ML-KEM-512 + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) + **************************************************/ +static uint32_t load24_littleendian(const uint8_t x[3]) +{ + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + return r; +} + +MLKEM_NATIVE_INTERNAL_API +void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 4; i++) + __loop__( + invariant(i <= MLKEM_N / 4) + invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4))) + { + unsigned j; + const uint32_t t = load24_littleendian(buf + 3 * i); + uint32_t d = t & 0x00249249; + d += (t >> 1) & 0x00249249; + d += (t >> 2) & 0x00249249; + + for (j = 0; j < 4; j++) + __loop__( + invariant(i <= MLKEM_N / 4 && j <= 4) + invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4))) + { + const int16_t a = (d >> (6 * j + 0)) & 0x7; + const int16_t b = (d >> (6 * j + 3)) & 0x7; + r->coeffs[4 * i + j] = a - b; + } + } +} +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \ + 3 */ + +#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ + +#define empty_cu_sampling MLKEM_NAMESPACE_K(empty_cu_sampling) +int empty_cu_sampling; + +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/sampling.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/sampling.h new file mode 100644 index 000000000..cc524e0fc --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/sampling.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef SAMPLING_H +#define SAMPLING_H + +#include +#include +#include "cbmc.h" +#include "common.h" +#include "poly.h" + +#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2) +/************************************************* + * Name: 
poly_cbd2 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter eta=2 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]); + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 +#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3) +/************************************************* + * Name: poly_cbd3 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter eta=3. + * This function is only needed for ML-KEM-512 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]); +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */ + +#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4) +/************************************************* + * Name: poly_rej_uniform_x4 + * + * Description: Generate four polynomials using rejection sampling + * on (pseudo-)uniformly random bytes sampled from a seed. + * + * Arguments: - poly *vec: Pointer to an array of 4 polynomials + * to be sampled. + * - uint8_t *seed[4]: Pointer to array of four pointers + * pointing to the seed buffers of size + * MLKEM_SYMBYTES + 2 each. 
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4]) +__contract__( + requires(memory_no_alias(vec, sizeof(poly) * 4)) + requires(memory_no_alias(seed, sizeof(uint8_t*) * 4)) + requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2)) + requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2)) + requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2)) + requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2)) + assigns(memory_slice(vec, sizeof(poly) * 4)) + ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q))); + +#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform) +/************************************************* + * Name: poly_rej_uniform + * + * Description: Generate polynomial using rejection sampling + * on (pseudo-)uniformly random bytes sampled from a seed. + * + * Arguments: - poly *vec: Pointer to polynomial to be sampled. + * - uint8_t *seed: Pointer to seed buffer of size + * MLKEM_SYMBYTES + 2 each. 
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) +__contract__( + requires(memory_no_alias(entry, sizeof(poly))) + requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2)) + assigns(memory_slice(entry, sizeof(poly))) + ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q))); + +#endif /* SAMPLING_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/zetas.c index 4ef887c62..987f0dce4 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/zetas.c @@ -10,7 +10,7 @@ #include "common.h" #if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) -#include "ntt.h" +#include "poly.h" /* * Table of zeta values used in the reference NTT and inverse NTT. diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/api.h deleted file mode 100644 index 792ecb8a4..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/api.h +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * Native arithmetic interface - * - * This header is primarily for documentation purposes. - * It should not be included by backend implementations. - * - * To ensure consistency with backends, the header will be - * included automatically after inclusion of the active - * backend, to ensure consistency of function signatures, - * and run sanity checks. - */ -#ifdef MLKEM_NATIVE_ARITH_NATIVE_API_H -#error \ - "The arithmetic backend API `mlkem/native/api.h` " \ - "should not be directly included. Please include the relevant " \ - "structure headers directly." 
-#else /* MLKEM_NATIVE_ARITH_NATIVE_API_H */ -#define MLKEM_NATIVE_ARITH_NATIVE_API_H - -#include -#include "poly.h" -#include "polyvec.h" - -/* - * This is the C<->native interface allowing for the drop-in of - * native code for performance critical arithmetic components of ML-KEM. - * - * A _backend_ is a specific implementation of (part of) this interface. - * - * To add a function to a backend, define MLKEM_USE_NATIVE_XXX and - * implement `static inline xxx(...)` in the profile header. - * - * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can - * be set if there are native implementations for all of NTT, invNTT, and - * base multiplication, and allows the native implementation to use a - * custom order of polynomial coefficients in NTT domain -- the use of such - * custom order is not an implementation-detail since the public matrix - * is generated in NTT domain. In this case, a permutation function - * poly_permute_bitrev_to_custom() needs to be provided that permutes - * polynomials in NTT domain from bitreversed to the custom order. - */ - -/* - * Those functions are meant to be trivial wrappers around the chosen native - * implementation. The are static inline to avoid unnecessary calls. - * The macro before each declaration controls whether a native - * implementation is present. - */ - -#if defined(MLKEM_USE_NATIVE_NTT) -/************************************************* - * Name: ntt_native - * - * Description: Computes negacyclic number-theoretic transform (NTT) of - * a polynomial in place. - * - * The input polynomial is assumed to be in normal order. - * The output polynomial is in bitreversed order, or of a - * custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. - * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER - * for more information. 
- * - * Arguments: - poly *p: pointer to in/output polynomial - **************************************************/ -static INLINE void ntt_native(poly *); -#endif /* MLKEM_USE_NATIVE_NTT */ - -#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER) -/* - * This must only be set if NTT, invNTT, basemul, mulcache, and - * to/from byte stream conversions all have native implementations - * that are adapted to the custom order. - */ -#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \ - !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ - !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) || \ - !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) || \ - !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) -#error \ - "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \ -set if there are native implementations for NTT, invNTT, mulcache, basemul, \ -and to/from bytes conversions." -#endif - -/************************************************* - * Name: poly_permute_bitrev_to_custom - * - * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined, - * convert a polynomial in NTT domain from bitreversed - * order to the custom order output by the native NTT. - * - * This must only be defined if there is native code for - * all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache. - * Arguments: - poly *p: pointer to in/output polynomial - * - **************************************************/ -static INLINE void poly_permute_bitrev_to_custom(poly *); -#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ - -#if defined(MLKEM_USE_NATIVE_INTT) -/************************************************* - * Name: intt_native - * - * Description: Computes inverse of negacyclic number-theoretic transform (NTT) - * of a polynomial in place. - * - * The input polynomial is in bitreversed order, or of a - * custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. - * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER - * for more information. 
- * The output polynomial is assumed to be in normal order. - * - * Arguments: - uint16_t *a: pointer to in/output polynomial - **************************************************/ -static INLINE void intt_native(poly *); -#endif /* MLKEM_USE_NATIVE_INTT */ - -#if defined(MLKEM_USE_NATIVE_POLY_REDUCE) -/************************************************* - * Name: poly_reduce_native - * - * Description: Applies modular reduction to all coefficients of a polynomial. - * - * Arguments: - poly *r: pointer to input/output polynomial - **************************************************/ -static INLINE void poly_reduce_native(poly *); -#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ - -#if defined(MLKEM_USE_NATIVE_POLY_TOMONT) -/************************************************* - * Name: poly_tomont_native - * - * Description: Inplace conversion of all coefficients of a polynomial - * from normal domain to Montgomery domain - * - * Arguments: - poly *r: pointer to input/output polynomial - **************************************************/ -static INLINE void poly_tomont_native(poly *); -#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ - -#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) -/************************************************* - * Name: poly_mulcache_compute_native - * - * Description: Compute multiplication cache for a polynomial - * in NTT domain. - * - * The purpose of the multiplication cache is to - * cache repeated computations required during a - * base multiplication of polynomials in NTT domain. - * The structure of the multiplication-cache is - * implementation defined. - * - * Arguments: INPUT: - * - poly: const pointer to input polynomial. - * This must be in NTT domain and inin bitreversed order, or of - * a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. - * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER - * for more information. 
- * OUTPUT - * - cache: pointer to multiplication cache - **************************************************/ -static INLINE void poly_mulcache_compute_native(poly_mulcache *cache, - const poly *poly); -#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ - -#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) -/************************************************* - * Name: poly_mulcache_compute_native - * - * Description: Compute multiplication of polynomials in NTT domain. - * - * Arguments: INPUT: - * - a: First polynomial operand. - * This must be in NTT domain and inin bitreversed order, or of - * a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. - * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER - * for more information. - * - b: Second polynomial operand. - * As for a. - * - b_cache: Multiplication-cache for b. - * OUTPUT - * - r: Result of the base multiplication. This is again - * in NTT domain, and of the same order as a and b. - **************************************************/ -static INLINE void polyvec_basemul_acc_montgomery_cached_native( - poly *r, const polyvec *a, const polyvec *b, - const polyvec_mulcache *b_cache); -#endif - -#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES) -/************************************************* - * Name: poly_tobytes_native - * - * Description: Serialization of a polynomial. - * Signed coefficients are converted to - * unsigned form before serialization. - * - * Arguments: INPUT: - * - a: const pointer to input polynomial, - * with each coefficient in the range -Q+1 .. 
Q-1 - * OUTPUT - * - r: pointer to output byte array - * (of MLKEM_POLYBYTES bytes) - **************************************************/ -static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], - const poly *a); -#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ - -#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) -/************************************************* - * Name: poly_frombytes_native - * - * Description: Serialization of a polynomial. - * Signed coefficients are converted to - * unsigned form before serialization. - * - * Arguments: INPUT: - * - r: pointer to output polynomial in NTT domain - * OUTPUT - * - a: const pointer to input byte aray - * (of MLKEM_POLYBYTES bytes) - **************************************************/ -static INLINE void poly_frombytes_native(poly *a, - const uint8_t r[MLKEM_POLYBYTES]); -#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ - -#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM) -/************************************************* - * Name: rej_uniform_native - * - * Description: Run rejection sampling on uniform random bytes to generate - * uniform random integers mod q - * - * Arguments: - int16_t *r: pointer to output buffer - * - unsigned int len: requested number of 16-bit integers - * (uniform mod q). - * - const uint8_t *buf: pointer to input buffer - * (assumed to be uniform random bytes) - * - unsigned int buflen: length of input buffer in bytes. - * - * Return -1 if the native implementation does not support the input lengths. - * Otherwise, returns non-negative number of sampled 16-bit integers (at most - * len). 
- **************************************************/ -static INLINE int rej_uniform_native(int16_t *r, unsigned int len, - const uint8_t *buf, unsigned int buflen); -#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */ - -#endif /* MLKEM_NATIVE_ARITH_NATIVE_API_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/arith_backend.h index 0543b1bd1..ade31cda1 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/arith_backend.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/arith_backend.h @@ -17,7 +17,7 @@ * Keep this _after_ the inclusion of the backend; otherwise, * the sanity checks won't have an effect. */ #if defined(MLKEM_NATIVE_CHECK_APIS) -#include "api.h" +#include "native/api.h" #endif #endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbd.c deleted file mode 100644 index 1e6b7c5d1..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbd.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED - -#include -#include "cbd.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. 
*/ -#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) -#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) -/* End of static namespacing */ - -/************************************************* - * Name: load32_littleendian - * - * Description: load 4 bytes into a 32-bit integer - * in little-endian order - * - * Arguments: - const uint8_t *x: pointer to input byte array - * - * Returns 32-bit unsigned integer loaded from x - **************************************************/ -static uint32_t load32_littleendian(const uint8_t x[4]) -{ - uint32_t r; - r = (uint32_t)x[0]; - r |= (uint32_t)x[1] << 8; - r |= (uint32_t)x[2] << 16; - r |= (uint32_t)x[3] << 24; - return r; -} - -MLKEM_NATIVE_INTERNAL_API -void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3))) - { - unsigned j; - uint32_t t = load32_littleendian(buf + 4 * i); - uint32_t d = t & 0x55555555; - d += (t >> 1) & 0x55555555; - - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3))) - { - const int16_t a = (d >> (4 * j + 0)) & 0x3; - const int16_t b = (d >> (4 * j + 2)) & 0x3; - r->coeffs[8 * i + j] = a - b; - } - } -} - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 -/************************************************* - * Name: load24_littleendian - * - * Description: load 3 bytes into a 32-bit integer - * in little-endian order. 
- * This function is only needed for ML-KEM-512 - * - * Arguments: - const uint8_t *x: pointer to input byte array - * - * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) - **************************************************/ -static uint32_t load24_littleendian(const uint8_t x[3]) -{ - uint32_t r; - r = (uint32_t)x[0]; - r |= (uint32_t)x[1] << 8; - r |= (uint32_t)x[2] << 16; - return r; -} - -MLKEM_NATIVE_INTERNAL_API -void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 4; i++) - __loop__( - invariant(i <= MLKEM_N / 4) - invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4))) - { - unsigned j; - const uint32_t t = load24_littleendian(buf + 3 * i); - uint32_t d = t & 0x00249249; - d += (t >> 1) & 0x00249249; - d += (t >> 2) & 0x00249249; - - for (j = 0; j < 4; j++) - __loop__( - invariant(i <= MLKEM_N / 4 && j <= 4) - invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4))) - { - const int16_t a = (d >> (6 * j + 0)) & 0x7; - const int16_t b = (d >> (6 * j + 3)) & 0x7; - r->coeffs[4 * i + j] = a - b; - } - } -} -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \ - 3 */ - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd) -int empty_cu_cbd; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbd.h deleted file mode 100644 index 54c1f5b90..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbd.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef CBD_H -#define CBD_H - -#include -#include "common.h" -#include "poly.h" - -#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2) -/************************************************* - * Name: poly_cbd2 - * - * Description: Given an array of uniformly 
random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter eta=2 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]); - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 -#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3) -/************************************************* - * Name: poly_cbd3 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter eta=3. - * This function is only needed for ML-KEM-512 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]); -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */ - -#endif /* CBD_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/common.h index 4f326333e..62ed53ab1 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/common.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/common.h @@ -15,12 +15,19 @@ #include "sys.h" /* Include backend metadata */ -#if defined(MLKEM_USE_NATIVE) -#if defined(MLKEM_NATIVE_ARITH_BACKEND) -#include MLKEM_NATIVE_ARITH_BACKEND +#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) +#if defined(MLKEM_NATIVE_ARITH_BACKEND_FILE) +#include MLKEM_NATIVE_ARITH_BACKEND_FILE +#else +#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_ARITH is set, but MLKEM_NATIVE_ARITH_BACKEND_FILE is not. 
+#endif #endif -#if defined(MLKEM_NATIVE_FIPS202_BACKEND) -#include MLKEM_NATIVE_FIPS202_BACKEND + +#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) +#if defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE) +#include MLKEM_NATIVE_FIPS202_BACKEND_FILE +#else +#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, but MLKEM_NATIVE_FIPS202_BACKEND_FILE is not. #endif #endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/compress.c new file mode 100644 index 000000000..a03fe0ac4 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/compress.c @@ -0,0 +1,395 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include "common.h" +#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) + +#include +#include +#include "arith_backend.h" +#include "cbmc.h" +#include "compress.h" +#include "debug.h" +#include "verify.h" + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3) +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + uint8_t t[8] = {0}; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_bound(t, 0, j, 0, 16))) + { + t[j] = scalar_compress_d4(a->coeffs[8 * i + j]); + } + + r[i * 4] = t[0] | (t[1] << 4); + r[i * 4 + 1] = t[2] | (t[3] << 4); + r[i * 4 + 2] = t[4] | (t[5] << 4); + r[i * 4 + 3] = t[6] | (t[7] << 4); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a) +{ + unsigned j; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + for (j = 0; j < MLKEM_N / 4; j++) + __loop__(invariant(j <= MLKEM_N / 4)) + { + unsigned k; + uint16_t t[4]; + for (k = 0; k < 4; k++) + __loop__( + 
invariant(k <= 4) + invariant(forall(r, 0, k, t[r] < (1u << 10)))) + { + t[k] = scalar_compress_d10(a->coeffs[4 * j + k]); + } + + /* + * Make all implicit truncation explicit. No data is being + * truncated for the LHS's since each t[i] is 10-bit in size. + */ + r[5 * j + 0] = (t[0] >> 0) & 0xFF; + r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF); + r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF); + r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF); + r[5 * j + 4] = (t[3] >> 2); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 2; i++) + __loop__( + invariant(i <= MLKEM_N / 2) + invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q))) + { + r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF); + r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF); + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d10(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]) +{ + unsigned j; + for (j = 0; j < MLKEM_N / 4; j++) + __loop__( + invariant(j <= MLKEM_N / 4) + invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q))) + { + unsigned k; + uint16_t t[4]; + uint8_t const *base = &a[5 * j]; + + t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); + t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6)); + t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4)); + t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2)); + + for (k = 0; k < 4; k++) + __loop__( + invariant(k <= 4) + invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q))) + { + r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ + || MLKEM_K == 3) */ + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 +MLKEM_NATIVE_INTERNAL_API +void 
poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + uint8_t t[8] = {0}; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_bound(t, 0, j, 0, 32))) + { + t[j] = scalar_compress_d5(a->coeffs[8 * i + j]); + } + + /* + * Explicitly truncate to avoid warning about + * implicit truncation in CBMC, and use array indexing into + * r rather than pointer-arithmetic to simplify verification + */ + r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5)); + r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); + r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4)); + r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); + r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3)); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a) +{ + unsigned j; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (j = 0; j < MLKEM_N / 8; j++) + __loop__(invariant(j <= MLKEM_N / 8)) + { + unsigned k; + uint16_t t[8]; + for (k = 0; k < 8; k++) + __loop__( + invariant(k <= 8) + invariant(forall(r, 0, k, t[r] < (1u << 11)))) + { + t[k] = scalar_compress_d11(a->coeffs[8 * j + k]); + } + + /* + * Make all implicit truncation explicit. No data is being + * truncated for the LHS's since each t[i] is 11-bit in size. 
+ */ + r[11 * j + 0] = (t[0] >> 0) & 0xFF; + r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF); + r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF); + r[11 * j + 3] = (t[2] >> 2) & 0xFF; + r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF); + r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF); + r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF); + r[11 * j + 7] = (t[5] >> 1) & 0xFF; + r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF); + r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF); + r[11 * j + 10] = (t[7] >> 3); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) + { + unsigned j; + uint8_t t[8]; + const unsigned offset = i * 5; + /* + * Explicitly truncate to avoid warning about + * implicit truncation in CBMC and unwind loop for ease + * of proof. + */ + + /* + * Decompress 5 8-bit bytes (so 40 bits) into + * 8 5-bit values stored in t[] + */ + t[0] = 0x1F & (a[offset + 0] >> 0); + t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3)); + t[2] = 0x1F & (a[offset + 1] >> 2); + t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1)); + t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4)); + t[5] = 0x1F & (a[offset + 3] >> 1); + t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2)); + t[7] = 0x1F & (a[offset + 4] >> 3); + + /* and copy to the correct slice in r[] */ + for (j = 0; j < 8; j++) + __loop__( + invariant(j <= 8 && i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) + { + r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d11(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]) +{ + unsigned j; + for (j = 0; j < MLKEM_N / 8; j++) + __loop__( + invariant(j <= MLKEM_N 
/ 8) + invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q))) + { + unsigned k; + uint16_t t[8]; + uint8_t const *base = &a[11 * j]; + t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); + t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5)); + t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) | + ((uint16_t)base[4] << 10)); + t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7)); + t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4)); + t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) | + ((uint16_t)base[8] << 9)); + t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6)); + t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3)); + + for (k = 0; k < 8; k++) + __loop__( + invariant(k <= 8) + invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q))) + { + r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */ + +#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 2; i++) + __loop__(invariant(i <= MLKEM_N / 2)) + { + const uint16_t t0 = a->coeffs[2 * i]; + const uint16_t t1 = a->coeffs[2 * i + 1]; + /* + * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of + * significant data, so these can be packed into 24 bits or exactly + * 3 bytes, as follows. + */ + + /* Least significant bits 0 - 7 of t0. */ + r[3 * i + 0] = t0 & 0xFF; + + /* + * Most significant bits 8 - 11 of t0 become the least significant + * nibble of the second byte. The least significant 4 bits + * of t1 become the upper nibble of the second byte. + */ + r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0); + + /* Bits 4 - 11 of t1 become the third byte. 
*/ + r[3 * i + 2] = t1 >> 4; + } +} +#else /* MLKEM_USE_NATIVE_POLY_TOBYTES */ +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +{ + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + poly_tobytes_native(r, a->coeffs); +} +#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + +#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 2; i++) + __loop__( + invariant(i <= MLKEM_N / 2) + invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT))) + { + const uint8_t t0 = a[3 * i + 0]; + const uint8_t t1 = a[3 * i + 1]; + const uint8_t t2 = a[3 * i + 2]; + r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF); + r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4); + } + + /* Note that the coefficients are not canonical */ + debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT); +} +#else /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +{ + poly_frombytes_native(r->coeffs, a); +} +#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ + +MLKEM_NATIVE_INTERNAL_API +void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +{ + unsigned i; +#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8) +#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!" 
+#endif + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) + { + unsigned j; + for (j = 0; j < 8; j++) + __loop__( + invariant(i < MLKEM_N / 8 && j <= 8) + invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) + { + /* Prevent the compiler from recognizing this as a bit selection */ + uint8_t mask = value_barrier_u8(1u << j); + r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask); + } + } + debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + msg[i] = 0; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8)) + { + uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]); + msg[i] |= t << j; + } + } +} + +#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ + +#define empty_cu_compress MLKEM_NAMESPACE_K(empty_cu_compress) +int empty_cu_compress; + +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/compress.h new file mode 100644 index 000000000..409dbe519 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/compress.h @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef COMPRESS_H +#define COMPRESS_H + +#include +#include +#include "cbmc.h" +#include "common.h" +#include "debug.h" +#include "poly.h" +#include "verify.h" + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1) +#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4) +#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5) +#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10) +#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11) +#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4) +#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5) +#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10) +#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11) +/* End of static namespacing */ + +/************************************************************ + * Name: scalar_compress_d1 + * + * Description: Computes round(u * 2 / q) + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 1. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. + */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d1(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 2) + ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) ) +{ + uint32_t d0 = u << 1; + d0 *= 645083; + d0 += 1u << 30; + d0 >>= 31; + return d0; +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_compress_d4 + * + * Description: Computes round(u * 16 / q) % 16 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 4. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. 
+ ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. + */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d4(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 16) + ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16)) +{ + uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */ + return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */ +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d4 + * + * Description: Computes round(u * q / 16) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 4. + * + * Arguments: - u: Unsigned canonical modulus modulo 16 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d4(uint32_t u) +__contract__( + requires(0 <= u && u < 16) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 8) / 16; } + +/************************************************************ + * Name: scalar_compress_d5 + * + * Description: Computes round(u * 32 / q) % 32 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 5. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. 
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d5(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 32) + ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) ) +{ + uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */ + return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */ +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d5 + * + * Description: Computes round(u * q / 32) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 5. + * + * Arguments: - u: Unsigned canonical modulus modulo 32 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d5(uint32_t u) +__contract__( + requires(0 <= u && u < 32) + ensures(return_value <= MLKEM_Q - 1) +) { return ((u * MLKEM_Q) + 16) / 32; } + +/************************************************************ + * Name: scalar_compress_d10 + * + * Description: Computes round(u * 2**10 / q) % 2**10 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 10. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. 
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d10(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < (1u << 10)) + ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10))) +{ + uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^33 / MLKEM_Q) */ + d0 = (d0 + ((uint64_t)1u << 32)) >> 33; + return (d0 & 0x3FF); +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d10 + * + * Description: Computes round(u * q / 1024) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 10. + * + * Arguments: - u: Unsigned canonical modulus modulo 1024 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d10(uint32_t u) +__contract__( + requires(0 <= u && u < 1024) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 512) / 1024; } + +/************************************************************ + * Name: scalar_compress_d11 + * + * Description: Computes round(u * 2**11 / q) % 2**11 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 11. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required.
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d11(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < (1u << 11)) + ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11))) +{ + uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */ + d0 = (d0 + ((uint64_t)1u << 32)) >> 33; + return (d0 & 0x7FF); +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d11 + * + * Description: Computes round(u * q / 2048) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 11. + * + * Arguments: - u: Unsigned canonical modulus modulo 2048 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d11(uint32_t u) +__contract__( + requires(0 <= u && u < 2048) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 1024) / 2048; } + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \ + (MLKEM_K == 2 || MLKEM_K == 3) +#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4) +/************************************************* + * Name: poly_compress_d4 + * + * Description: Compression (4 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a); + +#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10) +/************************************************* + * Name: poly_compress_d10 + * + * Description: Compression (10 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a); + +#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4) +/************************************************* + * Name: poly_decompress_d4 + * + * Description: De-serialization and subsequent decompression (dv bits) of a + * polynomial; approximate inverse of poly_compress + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). 
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]); + +#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10) +/************************************************* + * Name: poly_decompress_d10 + * + * Description: De-serialization and subsequent decompression (10 bits) of a + * polynomial; approximate inverse of poly_compress_d10 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d10(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]); +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ + || MLKEM_K == 3) */ + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 +#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5) +/************************************************* + * Name: poly_compress_d5 + * + * Description: Compression (5 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a); + +#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11) +/************************************************* + * Name: poly_compress_d11 + * + * Description: Compression (11 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a); + +#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5) +/************************************************* + * Name: poly_decompress_d5 + * + * Description: De-serialization and subsequent decompression (dv bits) of a + * polynomial; approximate inverse of poly_compress + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). 
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]); + +#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11) +/************************************************* + * Name: poly_decompress_d11 + * + * Description: De-serialization and subsequent decompression (11 bits) of a + * polynomial; approximate inverse of poly_compress_d11 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d11(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]); +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \ + */ + +#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes) +/************************************************* + * Name: poly_tobytes + * + * Description: Serialization of a polynomial. + * Signed coefficients are converted to + * unsigned form before serialization. 
+ * + * Arguments: INPUT: + * - a: const pointer to input polynomial, + * with each coefficient in the range [0,1,..,Q-1] + * OUTPUT + * - r: pointer to output byte array + * (of MLKEM_POLYBYTES bytes) + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYBYTES)) + requires(memory_no_alias(a, sizeof(poly))) + requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(r)) +); + + +#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes) +/************************************************* + * Name: poly_frombytes + * + * Description: De-serialization of a polynomial. + * + * Arguments: INPUT + * - a: pointer to input byte array + * (of MLKEM_POLYBYTES bytes) + * OUTPUT + * - r: pointer to output polynomial, with + * each coefficient unsigned and in the range + * 0 .. 4095 + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)) +); + + +#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg) +/************************************************* + * Name: poly_frommsg + * + * Description: Convert 32-byte message to polynomial + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *msg: pointer to input message + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +__contract__( + requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(object_whole(r)) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) +); + +#define poly_tomsg 
MLKEM_NAMESPACE(poly_tomsg) +/************************************************* + * Name: poly_tomsg + * + * Description: Convert polynomial to 32-byte message + * + * Arguments: - uint8_t *msg: pointer to output message + * - const poly *r: pointer to input polynomial + * Coefficients must be unsigned canonical + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r) +__contract__( + requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(msg)) +); + +#endif /* COMPRESS_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/config.h index fa89370ce..e975ede95 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/config.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/config.h @@ -122,46 +122,87 @@ /* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ /****************************************************************************** - * Name: MLKEM_USE_NATIVE + * Name: MLKEM_USE_NATIVE_BACKEND_ARITH * - * Description: Determines whether a native backend should - * be used, if available. + * Description: Determines whether a native arithmetic backend should be used. + * + * The arithmetic backend covers performance critical functions + * such as the number-theoretic transform (NTT). + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the arithmetic backend to be used is + * determined by MLKEM_NATIVE_ARITH_BACKEND_FILE: If the latter is + * unset, the default backend for your target architecture + * will be used. If set, it must be the name of a backend metadata + * file. * * This can also be set using CFLAGS.
* *****************************************************************************/ -#if !defined(MLKEM_USE_NATIVE) -/* #define MLKEM_USE_NATIVE */ +#if !defined(MLKEM_USE_NATIVE_BACKEND_ARITH) +/* #define MLKEM_USE_NATIVE_BACKEND_ARITH */ #endif /****************************************************************************** - * Name: MLKEM_NATIVE_ARITH_BACKEND + * Name: MLKEM_NATIVE_ARITH_BACKEND_FILE * * Description: The arithmetic backend to use. * - * This must be the filename of an arithmetic backend. - * See the existing backends for examples. + * If MLKEM_USE_NATIVE_BACKEND_ARITH is unset, this option + * is ignored. + * + * If MLKEM_USE_NATIVE_BACKEND_ARITH is set, this option must + * either be undefined or the filename of an arithmetic backend. + * If unset, the default backend will be used. * * This can be set using CFLAGS. * *****************************************************************************/ -#if defined(MLKEM_USE_NATIVE) && !defined(MLKEM_NATIVE_ARITH_BACKEND) -#define MLKEM_NATIVE_ARITH_BACKEND "default.h" -#endif /* MLKEM_NATIVE_ARITH_BACKEND */ +#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) && \ + !defined(MLKEM_NATIVE_ARITH_BACKEND_FILE) +#define MLKEM_NATIVE_ARITH_BACKEND_FILE "native/default.h" +#endif /****************************************************************************** - * Name: MLKEM_NATIVE_FIPS202_BACKEND + * Name: MLKEM_USE_NATIVE_BACKEND_FIPS202 + * + * Description: Determines whether a native FIPS202 backend should be used. + * + * The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is + * the performance bottleneck of SHA3 and SHAKE. + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the FIPS202 backend to be used is + * determined by MLKEM_NATIVE_FIPS202_BACKEND_FILE: If the latter is + * unset, the default backend for your target architecture + * will be used. If set, it must be the name of a backend metadata + * file. + * + * This can also be set using CFLAGS.
+ * + *****************************************************************************/ +#if !defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) +/* #define MLKEM_USE_NATIVE_BACKEND_FIPS202 */ +#endif + +/****************************************************************************** + * Name: MLKEM_NATIVE_FIPS202_BACKEND_FILE * * Description: The FIPS-202 backend to use. * - * This must be the filename of an FIPS-202 backend. + * If MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, this option must + * either be undefined or the filename of a FIPS202 backend. + * If unset, the default backend will be used. * * This can be set using CFLAGS. * *****************************************************************************/ -#if defined(MLKEM_USE_NATIVE_FIPS202) && !defined(MLKEM_NATIVE_FIPS202_BACKEND) -#define MLKEM_NATIVE_FIPS202_BACKEND "native/default.h" -#endif /* MLKEM_NATIVE_FIPS202_BACKEND */ +#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) && \ + !defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE) +#define MLKEM_NATIVE_FIPS202_BACKEND_FILE "fips202/native/default.h" +#endif /************************* Config internals ********************************/ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/default.h deleted file mode 100644 index d1e41c52e..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/default.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H -#define MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H - -/* - * Default arithmetic backend - */ -#include "sys.h" - -#ifdef SYS_AARCH64 -/* - * For AArch64, we currently we have one clean and one opt profile. - * We default to the opt profile. - * - * In the future, this may branch further depending on the microarchitecture. 
- */ -#include "aarch64/opt.h" -#endif /* SYS_AARCH64 */ - -#ifdef SYS_X86_64_AVX2 -/* - * For now, there's only one x86_64 profile, based on - * the AVX2 code from the Kyber repository. - * https://github.com/pq-crystals/kyber - */ -#include "x86_64/default.h" -#endif /* SYS_X86_64 */ - -#endif /* MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.c index 0cfcc3e9e..318d0fc77 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.c @@ -9,11 +9,10 @@ #include "fips202.h" #include "fips202x4.h" #include "indcpa.h" -#include "ntt.h" #include "poly.h" -#include "polyvec.h" +#include "poly_k.h" #include "randombytes.h" -#include "rej_uniform.h" +#include "sampling.h" #include "symmetric.h" #include "arith_backend.h" @@ -149,14 +148,14 @@ static void unpack_ciphertext(polyvec *b, poly *v, #define poly_permute_bitrev_to_custom \ MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom) -static INLINE void poly_permute_bitrev_to_custom(poly *data) +static INLINE void poly_permute_bitrev_to_custom(int16_t data[MLKEM_N]) __contract__( /* We don't specify that this should be a permutation, but only * that it does not change the bound established at the end of gen_matrix. 
*/ - requires(memory_no_alias(data, sizeof(poly))) - requires(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N)) + requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q)) assigns(memory_slice(data, sizeof(poly))) - ensures(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); } + ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); } #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ /* Not static for benchmarking */ @@ -245,7 +244,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed) { for (j = 0; j < MLKEM_K; j++) { - poly_permute_bitrev_to_custom(&a[i].vec[j]); + poly_permute_bitrev_to_custom(a[i].vec[j].coeffs); } } } diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.h index 2c4fda3c4..b4d5985bf 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.h @@ -8,7 +8,7 @@ #include #include "cbmc.h" #include "common.h" -#include "polyvec.h" +#include "poly_k.h" #define gen_matrix MLKEM_NAMESPACE_K(gen_matrix) /************************************************* diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/README.md b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/README.md similarity index 100% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/README.md rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/README.md diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/clean.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/clean.h similarity index 90% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/clean.h rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/clean.h index 43a401dfc..f124702a4 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/clean.h +++ 
b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/clean.h @@ -19,6 +19,6 @@ /* Filename of the C backend implementation. * This is not inlined here because this header is included in assembly * files as well. */ -#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "aarch64/src/clean_impl.h" +#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/aarch64/src/clean_impl.h" #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/opt.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/opt.h similarity index 91% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/opt.h rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/opt.h index 04323c3e7..a7217163f 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/opt.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/opt.h @@ -19,6 +19,6 @@ /* Filename of the C backend implementation. * This is not inlined here because this header is included in assembly * files as well. */ -#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "aarch64/src/opt_impl.h" +#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/aarch64/src/opt_impl.h" #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/aarch64_zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/aarch64_zetas.c similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/aarch64_zetas.c rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/aarch64_zetas.c index 1e189fd99..b3a6f198f 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/aarch64_zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/aarch64_zetas.c @@ -8,7 +8,7 @@ * Do not modify it directly. 
*/ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/arith_native_aarch64.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/arith_native_aarch64.h similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/arith_native_aarch64.h rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/arith_native_aarch64.h index fc4e7dd38..a784a3027 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/arith_native_aarch64.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/arith_native_aarch64.h @@ -6,7 +6,7 @@ #define MLKEM_AARCH64_NATIVE_H #include -#include "common.h" +#include "../../../common.h" #define aarch64_ntt_zetas_layer01234 \ MLKEM_NAMESPACE(aarch64_ntt_zetas_layer01234) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/clean_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/clean_impl.h similarity index 58% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/clean_impl.h rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/clean_impl.h index 548b1eebb..ded7d067a 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/clean_impl.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/clean_impl.h @@ -12,9 +12,6 @@ #include "arith_native_aarch64.h" -#include "poly.h" -#include "polyvec.h" - /* Set of primitives that this backend replaces */ #define MLKEM_USE_NATIVE_NTT #define MLKEM_USE_NATIVE_INTT @@ -25,45 +22,46 @@ #define MLKEM_USE_NATIVE_POLY_TOBYTES #define MLKEM_USE_NATIVE_REJ_UNIFORM -static INLINE void ntt_native(poly *data) +static INLINE void ntt_native(int16_t data[MLKEM_N]) { - ntt_asm_clean(data->coeffs, aarch64_ntt_zetas_layer01234, - 
aarch64_ntt_zetas_layer56); + ntt_asm_clean(data, aarch64_ntt_zetas_layer01234, aarch64_ntt_zetas_layer56); } -static INLINE void intt_native(poly *data) +static INLINE void intt_native(int16_t data[MLKEM_N]) { - intt_asm_clean(data->coeffs, aarch64_invntt_zetas_layer01234, + intt_asm_clean(data, aarch64_invntt_zetas_layer01234, aarch64_invntt_zetas_layer56); } -static INLINE void poly_reduce_native(poly *data) +static INLINE void poly_reduce_native(int16_t data[MLKEM_N]) { - poly_reduce_asm_clean(data->coeffs); + poly_reduce_asm_clean(data); } -static INLINE void poly_tomont_native(poly *data) + +static INLINE void poly_tomont_native(int16_t data[MLKEM_N]) { - poly_tomont_asm_clean(data->coeffs); + poly_tomont_asm_clean(data); } -static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const poly *y) +static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2], + const int16_t y[MLKEM_N]) { - poly_mulcache_compute_asm_clean(x->coeffs, y->coeffs, - aarch64_zetas_mulcache_native, + poly_mulcache_compute_asm_clean(x, y, aarch64_zetas_mulcache_native, aarch64_zetas_mulcache_twisted_native); } + static INLINE void polyvec_basemul_acc_montgomery_cached_native( - poly *r, const polyvec *a, const polyvec *b, - const polyvec_mulcache *b_cache) + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]) { - polyvec_basemul_acc_montgomery_cached_asm_clean( - r->coeffs, a->vec[0].coeffs, b->vec[0].coeffs, b_cache->vec[0].coeffs); + polyvec_basemul_acc_montgomery_cached_asm_clean(r, a, b, b_cache); } static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], - const poly *a) + const int16_t a[MLKEM_N]) { - poly_tobytes_asm_clean(r, a->coeffs); + poly_tobytes_asm_clean(r, a); } static INLINE int rej_uniform_native(int16_t *r, unsigned int len, diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/consts.h 
b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/consts.h similarity index 94% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/consts.h rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/consts.h index c40947299..e3ea26a27 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/consts.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/consts.h @@ -7,7 +7,7 @@ #define MLKEM_NATIVE_AARCH64_CONSTS #include -#include "common.h" +#include "../../../common.h" #define zetas_mulcache_native MLKEM_NAMESPACE(zetas_mulcache_native) extern const int16_t zetas_mulcache_native[256]; diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_clean.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_clean.S rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_clean.S index b243a569d..28ad38975 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_clean.S @@ -23,7 +23,7 @@ /// SOFTWARE. /// -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) // Bounds: diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_opt.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_opt.S rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_opt.S index c94746e17..857c729cb 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_opt.S @@ -23,7 +23,7 @@ /// SOFTWARE. 
/// -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) // Bounds: diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_clean.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_clean.S rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_clean.S index cd63cc4d6..30fdc76b0 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_clean.S @@ -24,7 +24,7 @@ /// SOFTWARE. /// -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) // Bounds: diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_opt.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_opt.S rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_opt.S index 8705615b7..431f9dc6f 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_opt.S @@ -24,7 +24,7 @@ /// SOFTWARE. 
/// -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) // Bounds: diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/opt_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/opt_impl.h similarity index 58% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/opt_impl.h rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/opt_impl.h index ec1bf6587..eb8e39ed0 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/opt_impl.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/opt_impl.h @@ -10,11 +10,9 @@ #else #define MLKEM_NATIVE_ARITH_PROFILE_IMPL_H +#include "../../../params.h" #include "arith_native_aarch64.h" -#include "poly.h" -#include "polyvec.h" - /* Set of primitives that this backend replaces */ #define MLKEM_USE_NATIVE_NTT #define MLKEM_USE_NATIVE_INTT @@ -25,45 +23,46 @@ #define MLKEM_USE_NATIVE_POLY_TOBYTES #define MLKEM_USE_NATIVE_REJ_UNIFORM -static INLINE void ntt_native(poly *data) +static INLINE void ntt_native(int16_t data[MLKEM_N]) { - ntt_asm_opt(data->coeffs, aarch64_ntt_zetas_layer01234, - aarch64_ntt_zetas_layer56); + ntt_asm_opt(data, aarch64_ntt_zetas_layer01234, aarch64_ntt_zetas_layer56); } -static INLINE void intt_native(poly *data) +static INLINE void intt_native(int16_t data[MLKEM_N]) { - intt_asm_opt(data->coeffs, aarch64_invntt_zetas_layer01234, + intt_asm_opt(data, aarch64_invntt_zetas_layer01234, aarch64_invntt_zetas_layer56); } -static INLINE void poly_reduce_native(poly *data) +static INLINE void poly_reduce_native(int16_t data[MLKEM_N]) { - poly_reduce_asm_opt(data->coeffs); + poly_reduce_asm_opt(data); } -static INLINE void poly_tomont_native(poly *data) + +static INLINE void poly_tomont_native(int16_t data[MLKEM_N]) { - poly_tomont_asm_opt(data->coeffs); + poly_tomont_asm_opt(data); } -static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const 
poly *y) +static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2], + const int16_t y[MLKEM_N]) { - poly_mulcache_compute_asm_opt(x->coeffs, y->coeffs, - aarch64_zetas_mulcache_native, + poly_mulcache_compute_asm_opt(x, y, aarch64_zetas_mulcache_native, aarch64_zetas_mulcache_twisted_native); } + static INLINE void polyvec_basemul_acc_montgomery_cached_native( - poly *r, const polyvec *a, const polyvec *b, - const polyvec_mulcache *b_cache) + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]) { - polyvec_basemul_acc_montgomery_cached_asm_opt( - r->coeffs, a->vec[0].coeffs, b->vec[0].coeffs, b_cache->vec[0].coeffs); + polyvec_basemul_acc_montgomery_cached_asm_opt(r, a, b, b_cache); } static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], - const poly *a) + const int16_t a[MLKEM_N]) { - poly_tobytes_asm_opt(r, a->coeffs); + poly_tobytes_asm_opt(r, a); } static INLINE int rej_uniform_native(int16_t *r, unsigned int len, diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/optimize.sh b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/optimize.sh similarity index 100% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/optimize.sh rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/optimize.sh diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_clean.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_clean.S rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_clean.S index 809f9667e..f3ee0796f 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_clean.S @@ -3,7 +3,7 @@ * 
SPDX-License-Identifier: Apache-2.0 */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) /* diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_opt.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_opt.S rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_opt.S index 815a9dd1a..555c60a67 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_opt.S @@ -3,7 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) /* diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_clean.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_clean.S rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_clean.S index c91675b44..0b6df6345 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_clean.S @@ -9,7 +9,7 @@ // https://eprint.iacr.org/2021/986 // https://github.com/neon-ntt/neon-ntt -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) // Input: diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_opt.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_opt.S rename to 
src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_opt.S index 8300b682c..7a27fda3e 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_opt.S @@ -9,7 +9,7 @@ // https://eprint.iacr.org/2021/986 // https://github.com/neon-ntt/neon-ntt -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) // Input: diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_asm_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_asm_clean.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_asm_clean.S rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_asm_clean.S index 5151a05d0..9158d6c82 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_asm_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_asm_clean.S @@ -18,7 +18,7 @@ * * Returns number of sampled 16-bit integers (at most MLKEM_N). 
**************************************************/ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_table.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_table.c similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_table.c rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_table.c index 507660349..29cdbe95f 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_table.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_table.c @@ -8,7 +8,7 @@ * Do not modify it directly. */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/api.h new file mode 100644 index 000000000..0704f9dcd --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/api.h @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * Native arithmetic interface + * + * This header is primarily for documentation purposes. + * It should not be included by backend implementations. + * + * To ensure consistency with backends, the header will be + * included automatically after inclusion of the active + * backend, to ensure consistency of function signatures, + * and run sanity checks. + */ +#ifdef MLKEM_NATIVE_ARITH_NATIVE_API_H +#error \ + "The arithmetic backend API `mlkem/native/api.h` " \ + "should not be directly included. Please include the relevant " \ + "structure headers directly." 
+#else /* MLKEM_NATIVE_ARITH_NATIVE_API_H */ +#define MLKEM_NATIVE_ARITH_NATIVE_API_H + +#include <stdint.h> +#include "../common.h" + +/* + * This is the C<->native interface allowing for the drop-in of + * native code for performance critical arithmetic components of ML-KEM. + * + * A _backend_ is a specific implementation of (part of) this interface. + * + * To add a function to a backend, define MLKEM_USE_NATIVE_XXX and + * implement `static inline xxx(...)` in the profile header. + * + * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can + * be set if there are native implementations for all of NTT, invNTT, and + * base multiplication, and allows the native implementation to use a + * custom order of polynomial coefficients in NTT domain -- the use of such + * custom order is not an implementation-detail since the public matrix + * is generated in NTT domain. In this case, a permutation function + * poly_permute_bitrev_to_custom() needs to be provided that permutes + * polynomials in NTT domain from bitreversed to the custom order. + */ + +/* + * Those functions are meant to be trivial wrappers around the chosen native + * implementation. They are static inline to avoid unnecessary calls. + * The macro before each declaration controls whether a native + * implementation is present. + */ + +#if defined(MLKEM_USE_NATIVE_NTT) +/************************************************* + * Name: ntt_native + * + * Description: Computes negacyclic number-theoretic transform (NTT) of + * a polynomial in place. + * + * The input polynomial is assumed to be in normal order. + * The output polynomial is in bitreversed order, or of a + * custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. + * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER + * for more information.
+ * + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial + **************************************************/ +static INLINE void ntt_native(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_NTT */ + +#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER) +/* + * This must only be set if NTT, invNTT, basemul, mulcache, and + * to/from byte stream conversions all have native implementations + * that are adapted to the custom order. + */ +#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) || \ + !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) || \ + !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) +#error \ + "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \ +set if there are native implementations for NTT, invNTT, mulcache, basemul, \ +and to/from bytes conversions." +#endif + +/************************************************* + * Name: poly_permute_bitrev_to_custom + * + * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined, + * convert a polynomial in NTT domain from bitreversed + * order to the custom order output by the native NTT. + * + * This must only be defined if there is native code for + * all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache. + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial + * + **************************************************/ +static INLINE void poly_permute_bitrev_to_custom(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ + +#if defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: intt_native + * + * Description: Computes inverse of negacyclic number-theoretic transform (NTT) + * of a polynomial in place. + * + * The input polynomial is in bitreversed order, or of a + * custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. 
+ * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER + * for more information. + * The output polynomial is assumed to be in normal order. + * + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial + **************************************************/ +static INLINE void intt_native(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_INTT */ + +#if defined(MLKEM_USE_NATIVE_POLY_REDUCE) +/************************************************* + * Name: poly_reduce_native + * + * Description: Applies modular reduction to all coefficients of a polynomial. + * + * Arguments: - int16_t p[MLKEM_N]: pointer to input/output polynomial + **************************************************/ +static INLINE void poly_reduce_native(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ + +#if defined(MLKEM_USE_NATIVE_POLY_TOMONT) +/************************************************* + * Name: poly_tomont_native + * + * Description: In-place conversion of all coefficients of a polynomial + * from normal domain to Montgomery domain + * + * Arguments: - int16_t p[MLKEM_N]: pointer to input/output polynomial + **************************************************/ +static INLINE void poly_tomont_native(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ + +#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) +/************************************************* + * Name: poly_mulcache_compute_native + * + * Description: Compute multiplication cache for a polynomial + * in NTT domain. + * + * The purpose of the multiplication cache is to + * cache repeated computations required during a + * base multiplication of polynomials in NTT domain. + * The structure of the multiplication-cache is + * implementation defined. + * + * Arguments: INPUT: + * - poly: const pointer to input polynomial. + * This must be in NTT domain and in bitreversed order, or of + * a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER + * for more information. + * OUTPUT + * - cache: pointer to multiplication cache + **************************************************/ +static INLINE void poly_mulcache_compute_native(int16_t cache[MLKEM_N / 2], + const int16_t poly[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ + +#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) +/************************************************* + * Name: polyvec_basemul_acc_montgomery_cached_native + * + * Description: Compute multiplication of polynomial vectors in NTT domain. + * + * Arguments: INPUT: + * - a: First polynomial vector operand (MLKEM_K polynomials). + * This must be in NTT domain and in bitreversed order, or of + * a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. + * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER + * for more information. + * - b: Second polynomial vector operand. + * As for a. + * - b_cache: Multiplication-cache for b. + * OUTPUT + * - r: Result of the base multiplication. This is again + * in NTT domain, and of the same order as a and b. + **************************************************/ +static INLINE void polyvec_basemul_acc_montgomery_cached_native( + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]); +#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ + +#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES) +/************************************************* + * Name: poly_tobytes_native + * + * Description: Serialization of a polynomial. + * Signed coefficients are converted to + * unsigned form before serialization. + * + * Arguments: INPUT: + * - a: const pointer to input polynomial, + * with each coefficient in the range -Q+1 ..
Q-1 + * OUTPUT + * - r: pointer to output byte array + * (of MLKEM_POLYBYTES bytes) + **************************************************/ +static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], + const int16_t a[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + +#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) +/************************************************* + * Name: poly_frombytes_native + * + * Description: Deserialization of a polynomial. + * Reads MLKEM_POLYBYTES bytes and writes + * MLKEM_N coefficients. + * + * Arguments: INPUT: + * - r: const pointer to input byte array + * (of MLKEM_POLYBYTES bytes) + * OUTPUT + * - a: pointer to output polynomial in NTT domain + **************************************************/ +static INLINE void poly_frombytes_native(int16_t a[MLKEM_N], + const uint8_t r[MLKEM_POLYBYTES]); +#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ + +#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM) +/************************************************* + * Name: rej_uniform_native + * + * Description: Run rejection sampling on uniform random bytes to generate + * uniform random integers mod q + * + * Arguments: - int16_t *r: pointer to output buffer + * - unsigned int len: requested number of 16-bit integers + * (uniform mod q). + * - const uint8_t *buf: pointer to input buffer + * (assumed to be uniform random bytes) + * - unsigned int buflen: length of input buffer in bytes. + * + * Return -1 if the native implementation does not support the input lengths. + * Otherwise, returns non-negative number of sampled 16-bit integers (at most + * len).
+ **************************************************/ +static INLINE int rej_uniform_native(int16_t *r, unsigned int len, + const uint8_t *buf, unsigned int buflen); +#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */ + +#endif /* MLKEM_NATIVE_ARITH_NATIVE_API_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/default.h new file mode 100644 index 000000000..f9fe4310a --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/default.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H +#define MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H + +/* + * Default arithmetic backend + */ +#include "../sys.h" + +#ifdef SYS_AARCH64 +/* + * For AArch64, we currently have one clean and one opt profile. + * We default to the opt profile. + * + * In the future, this may branch further depending on the microarchitecture. + */ +#include "aarch64/opt.h" +#endif /* SYS_AARCH64 */ + +#ifdef SYS_X86_64_AVX2 +/* + * For now, there's only one x86_64 profile, based on + * the AVX2 code from the Kyber repository.
+ * https://github.com/pq-crystals/kyber + */ +#include "x86_64/default.h" +#endif /* SYS_X86_64_AVX2 */ + +#endif /* MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/ntt.c deleted file mode 100644 index 3651c8da9..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/ntt.c +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) - -#include -#include "arith_backend.h" -#include "debug.h" -#include "ntt.h" -#include "reduce.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block) -#define ntt_layer MLKEM_NAMESPACE(ntt_layer) -#define invntt_layer MLKEM_NAMESPACE(invntt_layer) -/* End of static namespacing */ - -#if !defined(MLKEM_USE_NATIVE_NTT) -/* - * Computes a block CT butterflies with a fixed twiddle factor, - * using Montgomery multiplication. - * Parameters: - * - r: Pointer to base of polynomial (_not_ the base of butterfly block) - * - root: Twiddle factor to use for the butterfly. This must be in - * Montgomery form and signed canonical. - * - start: Offset to the beginning of the butterfly block - * - len: Index difference between coefficients subject to a butterfly - * - bound: Ghost variable describing coefficient bound: Prior to `start`, - * coefficients must be bound by `bound + MLKEM_Q`. Post `start`, - * they must be bound by `bound`. - * When this function returns, output coefficients in the index range - * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
- * Example: - * - start=8, len=4 - * This would compute the following four butterflies - * 8 -- 12 - * 9 -- 13 - * 10 -- 14 - * 11 -- 15 - * - start=4, len=2 - * This would compute the following two butterflies - * 4 -- 6 - * 5 -- 7 - */ -static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, - unsigned start, unsigned len, int bound) -__contract__( - requires(start < MLKEM_N) - requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N) - requires(0 <= bound && bound < INT16_MAX - MLKEM_Q) - requires(-HALF_Q < zeta && zeta < HALF_Q) - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(array_abs_bound(r, 0, start, bound + MLKEM_Q)) - requires(array_abs_bound(r, start, MLKEM_N, bound)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q)) - ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound))) -{ - /* `bound` is a ghost variable only needed in the CBMC specification */ - unsigned j; - ((void)bound); - for (j = start; j < start + len; j++) - __loop__( - invariant(start <= j && j <= start + len) - /* - * Coefficients are updated in strided pairs, so the bounds for the - * intermediate states alternate twice between the old and new bound - */ - invariant(array_abs_bound(r, 0, j, bound + MLKEM_Q)) - invariant(array_abs_bound(r, j, start + len, bound)) - invariant(array_abs_bound(r, start + len, j + len, bound + MLKEM_Q)) - invariant(array_abs_bound(r, j + len, MLKEM_N, bound))) - { - int16_t t; - t = fqmul(r[j + len], zeta); - r[j + len] = r[j] - t; - r[j] = r[j] + t; - } -} - -/* - *Compute one layer of forward NTT - * Parameters: - * - r: Pointer to base of polynomial - * - len: Stride of butterflies in this layer. - * - layer: Ghost variable indicating which layer is being applied. - * Must match `len` via `len == MLKEM_N >> layer`. 
- * Note: `len` could be dropped and computed in the function, but - * we are following the structure of the reference NTT from the - * official Kyber implementation here, merely adding `layer` as - * a ghost variable for the specifications. - */ -static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer) -__contract__( - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer)) - requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q))) -{ - unsigned start, k; - /* `layer` is a ghost variable only needed in the CBMC specification */ - ((void)layer); - /* Twiddle factors for layer n start at index 2^(layer-1) */ - k = MLKEM_N / (2 * len); - for (start = 0; start < MLKEM_N; start += 2 * len) - __loop__( - invariant(start < MLKEM_N + 2 * len) - invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N) - invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q)) - invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q))) - { - int16_t zeta = zetas[k++]; - ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q); - } -} - -/* - * Compute full forward NTT - * NOTE: This particular implementation satisfies a much tighter - * bound on the output coefficients (5*q) than the contractual one (8*q), - * but this is not needed in the calling code. Should we change the - * base multiplication strategy to require smaller NTT output bounds, - * the proof may need strengthening. 
- */ - -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *p) -{ - unsigned len, layer; - int16_t *r; - debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); - r = p->coeffs; - - for (len = 128, layer = 1; len >= 2; len >>= 1, layer++) - __loop__( - invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer)) - invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))) - { - ntt_layer(r, len, layer); - } - - /* Check the stronger bound */ - debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); -} -#else /* MLKEM_USE_NATIVE_NTT */ - -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *p) -{ - debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); - ntt_native(p); - debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); -} -#endif /* MLKEM_USE_NATIVE_NTT */ - -#if !defined(MLKEM_USE_NATIVE_INTT) - -/* Compute one layer of inverse NTT */ -static void invntt_layer(int16_t *r, unsigned len, unsigned layer) -__contract__( - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7) - requires(len == (1 << (8 - layer))) - requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) -{ - unsigned start, k; - /* `layer` is a ghost variable used only in the specification */ - ((void)layer); - k = MLKEM_N / len - 1; - for (start = 0; start < MLKEM_N; start += 2 * len) - __loop__( - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) - invariant(start <= MLKEM_N && k <= 127) - /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */ - invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len)) - { - unsigned j; - int16_t zeta = zetas[k--]; - for (j = start; j < start + len; j++) - __loop__( - invariant(start <= j && j <= start + len) - invariant(start <= MLKEM_N && k <= 127) - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) - { - int16_t t = r[j]; - r[j] = barrett_reduce(t + r[j + len]); - r[j + len] = r[j + len] - t; - r[j + len] = fqmul(r[j + len], 
zeta); - } - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *p) -{ - /* - * Scale input polynomial to account for Montgomery factor - * and NTT twist. This also brings coefficients down to - * absolute value < MLKEM_Q. - */ - unsigned j, len, layer; - const int16_t f = 1441; - int16_t *r = p->coeffs; - - for (j = 0; j < MLKEM_N; j++) - __loop__( - invariant(j <= MLKEM_N) - invariant(array_abs_bound(r, 0, j, MLKEM_Q))) - { - r[j] = fqmul(r[j], f); - } - - /* Run the invNTT layers */ - for (len = 2, layer = 7; len <= 128; len <<= 1, layer--) - __loop__( - invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer))) - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) - { - invntt_layer(p->coeffs, len, layer); - } - - debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); -} -#else /* MLKEM_USE_NATIVE_INTT */ - -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *p) -{ - intt_native(p); - debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); -} -#endif /* MLKEM_USE_NATIVE_INTT */ - -MLKEM_NATIVE_INTERNAL_API -void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], - int16_t b_cached) -{ - int32_t t0, t1; - debug_assert_bound(a, 2, 0, UINT12_LIMIT); - - t0 = (int32_t)a[1] * b_cached; - t0 += (int32_t)a[0] * b[0]; - t1 = (int32_t)a[0] * b[1]; - t1 += (int32_t)a[1] * b[0]; - - /* |ti| < 2 * q * 2^15 */ - r[0] = montgomery_reduce(t0); - r[1] = montgomery_reduce(t1); - - debug_assert_abs_bound(r, 2, 2 * MLKEM_Q); -} - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt) -int empty_cu_ntt; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/ntt.h deleted file mode 100644 index 4e80d3ab3..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/ntt.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * 
SPDX-License-Identifier: Apache-2.0 - */ -#ifndef NTT_H -#define NTT_H -#include "common.h" - -#include -#include "cbmc.h" -#include "poly.h" -#include "reduce.h" - -#define zetas MLKEM_NAMESPACE(zetas) -extern const int16_t zetas[128]; - -#define poly_ntt MLKEM_NAMESPACE(poly_ntt) -/************************************************* - * Name: poly_ntt - * - * Description: Computes negacyclic number-theoretic transform (NTT) of - * a polynomial in place. - * - * The input is assumed to be in normal order and - * coefficient-wise bound by MLKEM_Q in absolute value. - * - * The output polynomial is in bitreversed order, and - * coefficient-wise bound by NTT_BOUND in absolute value. - * - * (NOTE: Sometimes the input to the NTT is actually smaller, - * which gives better bounds.) - * - * Arguments: - poly *p: pointer to in/output polynomial - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *r) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q)) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND)) -); - -#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont) -/************************************************* - * Name: poly_invntt_tomont - * - * Description: Computes inverse of negacyclic number-theoretic transform (NTT) - * of a polynomial in place; - * inputs assumed to be in bitreversed order, output in normal - * order - * - * The input is assumed to be in bitreversed order, and can - * have arbitrary coefficients in int16_t. - * - * The output polynomial is in normal order, and - * coefficient-wise bound by INVNTT_BOUND in absolute value. 
- * - * Arguments: - uint16_t *a: pointer to in/output polynomial - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *r) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND)) -); - -#define basemul_cached MLKEM_NAMESPACE(basemul_cached) -/************************************************************ - * Name: basemul_cached - * - * Description: Computes a representative modulo q of - * (a0*b0 + a1*b_cached, a0*b1 + a1*b0)/65536 - * - * If b_cached is b1*zeta, this represents the - * product of (a0 + a1*X) and (b0 + b1*X) in - * Fq[X]/(X^2 - zeta). - * - * Arguments: - r: Pointer to output polynomial - * Upon return, coefficients are bound by - * 2*MLKEM_Q in absolute value. - * - a: Pointer to first input polynomial - * Every coefficient must be in [0..4095] - * - b: Pointer to second input polynomial - * Can have arbitrary int16_t coefficients - * - b_cached: Some precomputed value, typically derived from - * b1 and a twiddle factor. Can be an arbitary int16_t. 
- ************************************************************/ -MLKEM_NATIVE_INTERNAL_API -void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], - int16_t b_cached) -__contract__( - requires(memory_no_alias(r, 2 * sizeof(int16_t))) - requires(memory_no_alias(a, 2 * sizeof(int16_t))) - requires(memory_no_alias(b, 2 * sizeof(int16_t))) - requires(array_bound(a, 0, 2, 0, UINT12_LIMIT)) - assigns(memory_slice(r, 2 * sizeof(int16_t))) - ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q)) -); - -#endif /* NTT_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/params.h index 57ea4c8ba..7f6c12625 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/params.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/params.h @@ -18,6 +18,7 @@ #define MLKEM_N 256 #define MLKEM_Q 3329 #define UINT12_LIMIT 4096 +#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */ #define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define MLKEM_SSBYTES 32 /* size in bytes of shared key */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.c index 7483ebf6d..e8a2e2c6e 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.c @@ -8,388 +8,246 @@ #include #include #include "arith_backend.h" -#include "cbd.h" #include "cbmc.h" #include "debug.h" #include "fips202x4.h" -#include "ntt.h" #include "poly.h" -#include "reduce.h" +#include "sampling.h" #include "symmetric.h" #include "verify.h" -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3) -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a) -{ - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - uint8_t t[8] = {0}; - for 
(j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_bound(t, 0, j, 0, 16))) - { - t[j] = scalar_compress_d4(a->coeffs[8 * i + j]); - } - - r[i * 4] = t[0] | (t[1] << 4); - r[i * 4 + 1] = t[2] | (t[3] << 4); - r[i * 4 + 2] = t[4] | (t[5] << 4); - r[i * 4 + 3] = t[6] | (t[7] << 4); - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a) -{ - unsigned j; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - for (j = 0; j < MLKEM_N / 4; j++) - __loop__(invariant(j <= MLKEM_N / 4)) - { - unsigned k; - uint16_t t[4]; - for (k = 0; k < 4; k++) - __loop__( - invariant(k <= 4) - invariant(forall(r, 0, k, t[r] < (1u << 10)))) - { - t[k] = scalar_compress_d10(a->coeffs[4 * j + k]); - } - - /* - * Make all implicit truncation explicit. No data is being - * truncated for the LHS's since each t[i] is 10-bit in size. - */ - r[5 * j + 0] = (t[0] >> 0) & 0xFF; - r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF); - r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF); - r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF); - r[5 * j + 4] = (t[3] >> 2); - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 2; i++) - __loop__( - invariant(i <= MLKEM_N / 2) - invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q))) - { - r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF); - r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF); - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); -} - -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d10(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]) +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16) +#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic) +#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce) +#define fqmul MLKEM_NAMESPACE(fqmul) +#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce) +#define basemul_cached MLKEM_NAMESPACE(basemul_cached) +#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q) +#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block) +#define ntt_layer MLKEM_NAMESPACE(ntt_layer) +#define invntt_layer MLKEM_NAMESPACE(invntt_layer) +/* End of static namespacing */ + +/************************************************* + * Name: cast_uint16_to_int16 + * + * Description: Cast uint16 value to int16 + * + * Returns: + * input x in 0 .. 32767: returns value unchanged + * input x in 32768 .. 65535: returns (x - 65536) + **************************************************/ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "conversion" +#endif +ALWAYS_INLINE +static INLINE int16_t cast_uint16_to_int16(uint16_t x) { - unsigned j; - for (j = 0; j < MLKEM_N / 4; j++) - __loop__( - invariant(j <= MLKEM_N / 4) - invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q))) - { - unsigned k; - uint16_t t[4]; - uint8_t const *base = &a[5 * j]; - - t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); - t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6)); - t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4)); - t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2)); - - for (k = 0; k < 4; k++) - __loop__( - invariant(k <= 4) - invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q))) - { - r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]); - } - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + /* + * PORTABILITY: This relies on uint16_t -> int16_t + * being implemented as the inverse of int16_t -> uint16_t, + * which is implementation-defined (C99 6.3.1.3 (3)) + * 
CBMC (correctly) fails to prove this conversion is OK, + * so we have to suppress that check here + */ + return (int16_t)x; } -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ - || MLKEM_K == 3) */ +#ifdef CBMC +#pragma CPROVER check pop +#endif -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a) +/************************************************* + * Name: montgomery_reduce_generic + * + * Description: Generic Montgomery reduction; given a 32-bit integer a, computes + * 16-bit integer congruent to a * R^-1 mod q, where R=2^16 + * + * Arguments: - int32_t a: input integer to be reduced + * + * Returns: integer congruent to a * R^-1 modulo q, with absolute value + * <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2 + * + **************************************************/ +ALWAYS_INLINE +static INLINE int16_t montgomery_reduce_generic(int32_t a) { - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */ + const uint32_t QINV = 62209; /* q^-1 mod 2^16 */ - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - uint8_t t[8] = {0}; - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_bound(t, 0, j, 0, 32))) - { - t[j] = scalar_compress_d5(a->coeffs[8 * i + j]); - } + /* Compute a*q^{-1} mod 2^16 in unsigned representatives */ + const uint16_t a_reduced = a & UINT16_MAX; + const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX; - /* - * Explicitly truncate to avoid warning about - * implicit truncation in CBMC, and use array indexing into - * r rather than pointer-arithmetic to simplify verification - */ - r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5)); - r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); - r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4)); 
- r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); - r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3)); - } -} + /* Lift to signed canonical representative mod 2^16. */ + const int16_t t = cast_uint16_to_int16(a_inverted); -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a) -{ - unsigned j; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + int32_t r = a - ((int32_t)t * MLKEM_Q); + /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */ - for (j = 0; j < MLKEM_N / 8; j++) - __loop__(invariant(j <= MLKEM_N / 8)) - { - unsigned k; - uint16_t t[8]; - for (k = 0; k < 8; k++) - __loop__( - invariant(k <= 8) - invariant(forall(r, 0, k, t[r] < (1u << 11)))) - { - t[k] = scalar_compress_d11(a->coeffs[8 * j + k]); - } + /* + * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, + * implementation-defined for negative left argument. Here, + * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) + */ + r = r >> 16; + /* Bounds: |r >> 16| <= ceil(|r| / 2^16) + * <= ceil(|a| / 2^16 + MLKEM_Q / 2) + * <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 + * + * (Note that |a >> 16| = ceil(|a| / 2^16) for negative a) + */ - /* - * Make all implicit truncation explicit. No data is being - * truncated for the LHS's since each t[i] is 11-bit in size. 
- */ - r[11 * j + 0] = (t[0] >> 0) & 0xFF; - r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF); - r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF); - r[11 * j + 3] = (t[2] >> 2) & 0xFF; - r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF); - r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF); - r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF); - r[11 * j + 7] = (t[5] >> 1) & 0xFF; - r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF); - r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF); - r[11 * j + 10] = (t[7] >> 3); - } + return (int16_t)r; } -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]) +/************************************************* + * Name: montgomery_reduce + * + * Description: Montgomery reduction + * + * Arguments: - int32_t a: input integer to be reduced + * Must be smaller than 2 * 2^12 * 2^15 in absolute value. + * + * Returns: integer congruent to a * R^-1 modulo q, + * smaller than 2 * q in absolute value. + **************************************************/ +static INLINE int16_t montgomery_reduce(int32_t a) +__contract__( + requires(a > -(2 * UINT12_LIMIT * 32768)) + requires(a < (2 * UINT12_LIMIT * 32768)) + ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q) +) { - unsigned i; - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) - { - unsigned j; - uint8_t t[8]; - const unsigned offset = i * 5; - /* - * Explicitly truncate to avoid warning about - * implicit truncation in CBMC and unwind loop for ease - * of proof. 
- */ - - /* - * Decompress 5 8-bit bytes (so 40 bits) into - * 8 5-bit values stored in t[] - */ - t[0] = 0x1F & (a[offset + 0] >> 0); - t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3)); - t[2] = 0x1F & (a[offset + 1] >> 2); - t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1)); - t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4)); - t[5] = 0x1F & (a[offset + 3] >> 1); - t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2)); - t[7] = 0x1F & (a[offset + 4] >> 3); - - /* and copy to the correct slice in r[] */ - for (j = 0; j < 8; j++) - __loop__( - invariant(j <= 8 && i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) - { - r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]); - } - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + int16_t res; + debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768); + + res = montgomery_reduce_generic(a); + /* Bounds: + * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2 + * <= UINT12_LIMIT + (MLKEM_Q + 1) / 2 + * < 2 * MLKEM_Q */ + + debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q); + return res; } -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d11(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]) +#if !defined(MLKEM_USE_NATIVE_POLY_TOMONT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: fqmul + * + * Description: Montgomery multiplication modulo q=3329 + * + * Arguments: - int16_t a: first factor + * Can be any int16_t. + * - int16_t b: second factor. + * Must be signed canonical (abs value <(q+1)/2) + * + * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and + * smaller than q in absolute value. 
+ * + **************************************************/ +static INLINE int16_t fqmul(int16_t a, int16_t b) +__contract__( + requires(b > -HALF_Q) + requires(b < HALF_Q) + ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q) +) { - unsigned j; - for (j = 0; j < MLKEM_N / 8; j++) - __loop__( - invariant(j <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q))) - { - unsigned k; - uint16_t t[8]; - uint8_t const *base = &a[11 * j]; - t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); - t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5)); - t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) | - ((uint16_t)base[4] << 10)); - t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7)); - t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4)); - t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) | - ((uint16_t)base[8] << 9)); - t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6)); - t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3)); - - for (k = 0; k < 8; k++) - __loop__( - invariant(k <= 8) - invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q))) - { - r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]); - } - } + int16_t res; + debug_assert_abs_bound(&b, 1, HALF_Q); + + res = montgomery_reduce((int32_t)a * (int32_t)b); + /* Bounds: + * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2 + * < MLKEM_Q + */ - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + debug_assert_abs_bound(&res, 1, MLKEM_Q); + return res; } -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */ - -#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +#endif /* !defined(MLKEM_USE_NATIVE_POLY_TOMONT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_NTT) || \ + 
!defined(MLKEM_USE_NATIVE_INTT) */ + +#if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || !defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: barrett_reduce + * + * Description: Barrett reduction; given a 16-bit integer a, computes + * centered representative congruent to a mod q in + * {-(q-1)/2,...,(q-1)/2} + * + * Arguments: - int16_t a: input integer to be reduced + * + * Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. + **************************************************/ +static INLINE int16_t barrett_reduce(int16_t a) +__contract__( + ensures(return_value > -HALF_Q && return_value < HALF_Q) +) { - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_N / 2; i++) - __loop__(invariant(i <= MLKEM_N / 2)) - { - const uint16_t t0 = a->coeffs[2 * i]; - const uint16_t t1 = a->coeffs[2 * i + 1]; - /* - * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of - * significant data, so these can be packed into 24 bits or exactly - * 3 bytes, as follows. - */ - - /* Least significant bits 0 - 7 of t0. */ - r[3 * i + 0] = t0 & 0xFF; - - /* - * Most significant bits 8 - 11 of t0 become the least significant - * nibble of the second byte. The least significant 4 bits - * of t1 become the upper nibble of the second byte. - */ - r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0); + /* + * To divide by MLKEM_Q using Barrett multiplication, the "magic number" + * multiplier is round_to_nearest(2**26/MLKEM_Q) + */ + const int BPOWER = 26; + const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q; - /* Bits 4 - 11 of t1 become the third byte. 
*/ - r[3 * i + 2] = t1 >> 4; - } -} -#else /* MLKEM_USE_NATIVE_POLY_TOBYTES */ -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) -{ - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - poly_tobytes_native(r, a); -} -#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + /* + * Compute round_to_nearest(a/MLKEM_Q) using the multiplier + * above and shift by BPOWER places. + * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, + * implementation-defined for negative left argument. Here, + * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) + */ + const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER; -#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 2; i++) - __loop__( - invariant(i <= MLKEM_N / 2) - invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT))) - { - const uint8_t t0 = a[3 * i + 0]; - const uint8_t t1 = a[3 * i + 1]; - const uint8_t t2 = a[3 * i + 2]; - r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF); - r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4); - } + /* + * t is in -10 .. 
+10, so we need 32-bit math to + * evaluate t * MLKEM_Q and the subsequent subtraction + */ + int16_t res = (int16_t)(a - t * MLKEM_Q); - /* Note that the coefficients are not canonical */ - debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT); -} -#else /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -{ - poly_frombytes_native(r, a); + debug_assert_abs_bound(&res, 1, HALF_Q); + return res; } -#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ - -MLKEM_NATIVE_INTERNAL_API -void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +#endif /* !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || \ + !defined(MLKEM_USE_NATIVE_INTT) */ + +static void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], + int16_t b_cached) +__contract__( + requires(memory_no_alias(r, 2 * sizeof(int16_t))) + requires(memory_no_alias(a, 2 * sizeof(int16_t))) + requires(memory_no_alias(b, 2 * sizeof(int16_t))) + requires(array_bound(a, 0, 2, 0, UINT12_LIMIT)) + assigns(memory_slice(r, 2 * sizeof(int16_t))) + ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))) { - unsigned i; -#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8) -#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!" 
-#endif + int32_t t0, t1; + debug_assert_bound(a, 2, 0, UINT12_LIMIT); - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) - { - unsigned j; - for (j = 0; j < 8; j++) - __loop__( - invariant(i < MLKEM_N / 8 && j <= 8) - invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) - { - /* Prevent the compiler from recognizing this as a bit selection */ - uint8_t mask = value_barrier_u8(1u << j); - r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask); - } - } - debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); -} + t0 = (int32_t)a[1] * b_cached; + t0 += (int32_t)a[0] * b[0]; + t1 = (int32_t)a[0] * b[1]; + t1 += (int32_t)a[1] * b[0]; -MLKEM_NATIVE_INTERNAL_API -void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a) -{ - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + /* |ti| < 2 * q * 2^15 */ + r[0] = montgomery_reduce(t0); + r[1] = montgomery_reduce(t1); - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - msg[i] = 0; - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8)) - { - uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]); - msg[i] |= t << j; - } - } + debug_assert_abs_bound(r, 2, 2 * MLKEM_Q); } MLKEM_NATIVE_INTERNAL_API @@ -434,12 +292,46 @@ void poly_tomont(poly *r) MLKEM_NATIVE_INTERNAL_API void poly_tomont(poly *r) { - poly_tomont_native(r); + poly_tomont_native(r->coeffs); debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); } #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ #if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) +/************************************************************ + * Name: scalar_signed_to_unsigned_q + * + * Description: converts signed polynomial coefficient + * from signed (-3328 .. 3328) form to + * unsigned form (0 .. 3328). 
+ * + * Note: Cryptographic constant time implementation + * + * Examples: 0 -> 0 + * 1 -> 1 + * 3328 -> 3328 + * -1 -> 3328 + * -2 -> 3327 + * -3328 -> 1 + * + * Arguments: c: signed coefficient to be converted + ************************************************************/ +static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c) +__contract__( + requires(c > -MLKEM_Q && c < MLKEM_Q) + ensures(return_value >= 0 && return_value < MLKEM_Q) + ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q))) +{ + debug_assert_abs_bound(&c, 1, MLKEM_Q); + + /* Add Q if c is negative, but in constant time */ + c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c)); + + /* and therefore cast to uint16_t is safe. */ + debug_assert_bound(&c, 1, 0, MLKEM_Q); + return (uint16_t)c; +} + MLKEM_NATIVE_INTERNAL_API void poly_reduce(poly *r) { @@ -461,7 +353,7 @@ void poly_reduce(poly *r) MLKEM_NATIVE_INTERNAL_API void poly_reduce(poly *r) { - poly_reduce_native(r); + poly_reduce_native(r->coeffs); debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); } #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ @@ -520,13 +412,232 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a) MLKEM_NATIVE_INTERNAL_API void poly_mulcache_compute(poly_mulcache *x, const poly *a) { - poly_mulcache_compute_native(x, a); + poly_mulcache_compute_native(x->coeffs, a->coeffs); /* Omitting bounds assertion since native implementations may * decide not to use a mulcache. Note that the C backend implementation * of poly_basemul_montgomery_cached() does still include the check. */ } #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ +#if !defined(MLKEM_USE_NATIVE_NTT) +/* + * Computes a block of CT butterflies with a fixed twiddle factor, + * using Montgomery multiplication. + * Parameters: + * - r: Pointer to base of polynomial (_not_ the base of butterfly block) + * - zeta: Twiddle factor to use for the butterfly. This must be in + * Montgomery form and signed canonical. 
+ * - start: Offset to the beginning of the butterfly block + * - len: Index difference between coefficients subject to a butterfly + * - bound: Ghost variable describing coefficient bound: Prior to `start`, + * coefficients must be bound by `bound + MLKEM_Q`. Post `start`, + * they must be bound by `bound`. + * When this function returns, output coefficients in the index range + * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`. + * Example: + * - start=8, len=4 + * This would compute the following four butterflies + * 8 -- 12 + * 9 -- 13 + * 10 -- 14 + * 11 -- 15 + * - start=4, len=2 + * This would compute the following two butterflies + * 4 -- 6 + * 5 -- 7 + */ +static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, + unsigned start, unsigned len, int bound) +__contract__( + requires(start < MLKEM_N) + requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N) + requires(0 <= bound && bound < INT16_MAX - MLKEM_Q) + requires(-HALF_Q < zeta && zeta < HALF_Q) + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(array_abs_bound(r, 0, start, bound + MLKEM_Q)) + requires(array_abs_bound(r, start, MLKEM_N, bound)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q)) + ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound))) +{ + /* `bound` is a ghost variable only needed in the CBMC specification */ + unsigned j; + ((void)bound); + for (j = start; j < start + len; j++) + __loop__( + invariant(start <= j && j <= start + len) + /* + * Coefficients are updated in strided pairs, so the bounds for the + * intermediate states alternate twice between the old and new bound + */ + invariant(array_abs_bound(r, 0, j, bound + MLKEM_Q)) + invariant(array_abs_bound(r, j, start + len, bound)) + invariant(array_abs_bound(r, start + len, j + len, bound + MLKEM_Q)) + invariant(array_abs_bound(r, j + len, MLKEM_N, bound))) + { + int16_t t; + t = fqmul(r[j + len], 
zeta); + r[j + len] = r[j] - t; + r[j] = r[j] + t; + } +} + +/* + * Compute one layer of forward NTT + * Parameters: + * - r: Pointer to base of polynomial + * - len: Stride of butterflies in this layer. + * - layer: Ghost variable indicating which layer is being applied. + * Must match `len` via `len == MLKEM_N >> layer`. + * Note: `len` could be dropped and computed in the function, but + * we are following the structure of the reference NTT from the + * official Kyber implementation here, merely adding `layer` as + * a ghost variable for the specifications. + */ +static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer) +__contract__( + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer)) + requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q))) +{ + unsigned start, k; + /* `layer` is a ghost variable only needed in the CBMC specification */ + ((void)layer); + /* Twiddle factors for this layer start at index 2^(layer-1) */ + k = MLKEM_N / (2 * len); + for (start = 0; start < MLKEM_N; start += 2 * len) + __loop__( + invariant(start < MLKEM_N + 2 * len) + invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N) + invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q)) + invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q))) + { + int16_t zeta = zetas[k++]; + ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q); + } +} + +/* + * Compute full forward NTT + * NOTE: This particular implementation satisfies a much tighter + * bound on the output coefficients (5*q) than the contractual one (8*q), + * but this is not needed in the calling code. Should we change the + * base multiplication strategy to require smaller NTT output bounds, + * the proof may need strengthening. 
+ */ + +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *p) +{ + unsigned len, layer; + int16_t *r; + debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); + r = p->coeffs; + + for (len = 128, layer = 1; len >= 2; len >>= 1, layer++) + __loop__( + invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer)) + invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))) + { + ntt_layer(r, len, layer); + } + + /* Check the stronger bound */ + debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); +} +#else /* MLKEM_USE_NATIVE_NTT */ + +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *p) +{ + debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); + ntt_native(p->coeffs); + debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); +} +#endif /* MLKEM_USE_NATIVE_NTT */ + +#if !defined(MLKEM_USE_NATIVE_INTT) + +/* Compute one layer of inverse NTT */ +static void invntt_layer(int16_t *r, unsigned len, unsigned layer) +__contract__( + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7) + requires(len == (1 << (8 - layer))) + requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) +{ + unsigned start, k; + /* `layer` is a ghost variable used only in the specification */ + ((void)layer); + k = MLKEM_N / len - 1; + for (start = 0; start < MLKEM_N; start += 2 * len) + __loop__( + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) + invariant(start <= MLKEM_N && k <= 127) + /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */ + invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len)) + { + unsigned j; + int16_t zeta = zetas[k--]; + for (j = start; j < start + len; j++) + __loop__( + invariant(start <= j && j <= start + len) + invariant(start <= MLKEM_N && k <= 127) + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) + { + int16_t t = r[j]; + r[j] = barrett_reduce(t + r[j + len]); + r[j + len] = r[j + len] - t; + r[j + len] = fqmul(r[j 
+ len], zeta); + } + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *p) +{ + /* + * Scale input polynomial to account for Montgomery factor + * and NTT twist. This also brings coefficients down to + * absolute value < MLKEM_Q. + */ + unsigned j, len, layer; + const int16_t f = 1441; + int16_t *r = p->coeffs; + + for (j = 0; j < MLKEM_N; j++) + __loop__( + invariant(j <= MLKEM_N) + invariant(array_abs_bound(r, 0, j, MLKEM_Q))) + { + r[j] = fqmul(r[j], f); + } + + /* Run the invNTT layers */ + for (len = 2, layer = 7; len <= 128; len <<= 1, layer--) + __loop__( + invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer))) + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) + { + invntt_layer(p->coeffs, len, layer); + } + + debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); +} +#else /* MLKEM_USE_NATIVE_INTT */ + +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *p) +{ + intt_native(p->coeffs); + debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); +} +#endif /* MLKEM_USE_NATIVE_INTT */ + #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ #define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.h index 6a14c785d..cb0d67c1a 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.h @@ -9,7 +9,7 @@ #include #include "cbmc.h" #include "common.h" -#include "reduce.h" +#include "debug.h" #include "verify.h" /* Absolute exclusive upper bound for the output of the inverse NTT */ @@ -18,6 +18,9 @@ /* Absolute exclusive upper bound for the output of the forward NTT */ #define NTT_BOUND (8 * MLKEM_Q) +#define zetas MLKEM_NAMESPACE(zetas) +extern const int16_t zetas[128]; + /* * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] @@ -38,520 +41,6 @@ typedef struct int16_t coeffs[MLKEM_N >> 1]; } poly_mulcache; -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1) -#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4) -#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5) -#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10) -#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11) -#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4) -#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5) -#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10) -#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11) -#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q) -/* End of static namespacing */ - -/************************************************************ - * Name: scalar_compress_d1 - * - * Description: Computes round(u * 2 / q) - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 1. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. 
- */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d1(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 2) - ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) ) -{ - uint32_t d0 = u << 1; - d0 *= 645083; - d0 += 1u << 30; - d0 >>= 31; - return d0; -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_compress_d4 - * - * Description: Computes round(u * 16 / q) % 16 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 4. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d4(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 16) - ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16)) -{ - uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */ - return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */ -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d4 - * - * Description: Computes round(u * q / 16) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 4. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d4(uint32_t u) -__contract__( - requires(0 <= u && u < 16) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 8) / 16; } - -/************************************************************ - * Name: scalar_compress_d5 - * - * Description: Computes round(u * 32 / q) % 32 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 5. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d5(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 32) - ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) ) -{ - uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */ - return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */ -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d5 - * - * Description: Computes round(u * q / 32) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 5. - * - * Arguments: - u: Unsigned canonical modulus modulo 32 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d5(uint32_t u) -__contract__( - requires(0 <= u && u < 32) - ensures(return_value <= MLKEM_Q - 1) -) { return ((u * MLKEM_Q) + 16) / 32; } - -/************************************************************ - * Name: scalar_compress_d10 - * - * Description: Computes round(u * 2**10 / q) % 2**10 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d10(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < (1u << 10)) - ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10))) -{ - uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^32 / MLKEM_Q) */ - d0 = (d0 + ((uint64_t)1u << 32)) >> 33; - return (d0 & 0x3FF); -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d10 - * - * Description: Computes round(u * q / 1024) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d10(uint32_t u) -__contract__( - requires(0 <= u && u < 1024) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 512) / 1024; } - -/************************************************************ - * Name: scalar_compress_d11 - * - * Description: Computes round(u * 2**11 / q) % 2**11 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 11. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d11(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < (1u << 11)) - ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11))) -{ - uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */ - d0 = (d0 + ((uint64_t)1u << 32)) >> 33; - return (d0 & 0x7FF); -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d11 - * - * Description: Computes round(u * q / 1024) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d11(uint32_t u) -__contract__( - requires(0 <= u && u < 2048) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 1024) / 2048; } - -/************************************************************ - * Name: scalar_signed_to_unsigned_q - * - * Description: converts signed polynomial coefficient - * from signed (-3328 .. 3328) form to - * unsigned form (0 .. 3328). - * - * Note: Cryptographic constant time implementation - * - * Examples: 0 -> 0 - * 1 -> 1 - * 3328 -> 3328 - * -1 -> 3328 - * -2 -> 3327 - * -3328 -> 1 - * - * Arguments: c: signed coefficient to be converted - ************************************************************/ -static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c) -__contract__( - requires(c > -MLKEM_Q && c < MLKEM_Q) - ensures(return_value >= 0 && return_value < MLKEM_Q) - ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q))) -{ - debug_assert_abs_bound(&c, 1, MLKEM_Q); - - /* Add Q if c is negative, but in constant time */ - c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c)); - - /* and therefore cast to uint16_t is safe. */ - debug_assert_bound(&c, 1, 0, MLKEM_Q); - return (uint16_t)c; -} - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \ - (MLKEM_K == 2 || MLKEM_K == 3) -#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4) -/************************************************* - * Name: poly_compress_d4 - * - * Description: Compression (4 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a); - -#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10) -/************************************************* - * Name: poly_compress_d10 - * - * Description: Compression (10 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a); - -#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4) -/************************************************* - * Name: poly_decompress_d4 - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]); - -#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10) -/************************************************* - * Name: poly_decompress_d10 - * - * Description: De-serialization and subsequent decompression (10 bits) of a - * polynomial; approximate inverse of poly_compress_d10 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d10(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]); -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ - || MLKEM_K == 3) */ - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 -#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5) -/************************************************* - * Name: poly_compress_d5 - * - * Description: Compression (5 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a); - -#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11) -/************************************************* - * Name: poly_compress_d11 - * - * Description: Compression (11 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a); - -#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5) -/************************************************* - * Name: poly_decompress_d5 - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]); - -#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11) -/************************************************* - * Name: poly_decompress_d11 - * - * Description: De-serialization and subsequent decompression (11 bits) of a - * polynomial; approximate inverse of poly_compress_d11 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d11(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]); -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \ - */ - -#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes) -/************************************************* - * Name: poly_tobytes - * - * Description: Serialization of a polynomial. - * Signed coefficients are converted to - * unsigned form before serialization. 
- * - * Arguments: INPUT: - * - a: const pointer to input polynomial, - * with each coefficient in the range [0,1,..,Q-1] - * OUTPUT - * - r: pointer to output byte array - * (of MLKEM_POLYBYTES bytes) - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYBYTES)) - requires(memory_no_alias(a, sizeof(poly))) - requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(r)) -); - - -#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes) -/************************************************* - * Name: poly_frombytes - * - * Description: De-serialization of a polynomial. - * - * Arguments: INPUT - * - a: pointer to input byte array - * (of MLKEM_POLYBYTES bytes) - * OUTPUT - * - r: pointer to output polynomial, with - * each coefficient unsigned and in the range - * 0 .. 4095 - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)) -); - - -#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg) -/************************************************* - * Name: poly_frommsg - * - * Description: Convert 32-byte message to polynomial - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *msg: pointer to input message - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) -__contract__( - requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(object_whole(r)) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) -); - -#define poly_tomsg 
MLKEM_NAMESPACE(poly_tomsg) -/************************************************* - * Name: poly_tomsg - * - * Description: Convert polynomial to 32-byte message - * - * Arguments: - uint8_t *msg: pointer to output message - * - const poly *r: pointer to input polynomial - * Coefficients must be unsigned canonical - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r) -__contract__( - requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(msg)) -); - #define poly_basemul_montgomery_cached \ MLKEM_NAMESPACE(poly_basemul_montgomery_cached) /************************************************* @@ -715,4 +204,56 @@ __contract__( assigns(object_whole(r)) ); +#define poly_ntt MLKEM_NAMESPACE(poly_ntt) +/************************************************* + * Name: poly_ntt + * + * Description: Computes negacyclic number-theoretic transform (NTT) of + * a polynomial in place. + * + * The input is assumed to be in normal order and + * coefficient-wise bound by MLKEM_Q in absolute value. + * + * The output polynomial is in bitreversed order, and + * coefficient-wise bound by NTT_BOUND in absolute value. + * + * (NOTE: Sometimes the input to the NTT is actually smaller, + * which gives better bounds.) 
+ * + * Arguments: - poly *r: pointer to in/output polynomial + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *r) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q)) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND)) +); + +#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont) +/************************************************* + * Name: poly_invntt_tomont + * + * Description: Computes inverse of negacyclic number-theoretic transform (NTT) + * of a polynomial in place; + * inputs assumed to be in bitreversed order, output in normal + * order + * + * The input is assumed to be in bitreversed order, and can + * have arbitrary coefficients in int16_t. + * + * The output polynomial is in normal order, and + * coefficient-wise bound by INVNTT_BOUND in absolute value. + * + * Arguments: - poly *r: pointer to in/output polynomial + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *r) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND)) +); + #endif /* POLY_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly_k.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly_k.c new file mode 100644 index 000000000..c2d330ea9 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly_k.c @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include "poly_k.h" +#include <stdint.h> +#include <string.h> +#include "arith_backend.h" +#include "compress.h" +#include "sampling.h" +#include "symmetric.h" + +#include "debug.h" + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g.
with varying security levels) + * within a single compilation unit. */ +#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1) +#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2) +/* End of static namespacing */ + +MLKEM_NATIVE_INTERNAL_API +void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU], + const polyvec *a) +{ + unsigned i; + debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_K; i++) + { + poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]); + } +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_decompress_du(polyvec *r, + const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU]) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU); + } + + debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a) +{ + unsigned i; + debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_K; i++) + { + poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]); + } +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES]) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES); + } + + debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_ntt(polyvec *r) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_ntt(&r->vec[i]); + } + + debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_invntt_tomont(polyvec *r) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_invntt_tomont(&r->vec[i]); + } + + debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND); +} + +#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, + 
const polyvec *b, + const polyvec_mulcache *b_cache) +{ + unsigned i; + poly t; + debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); + + poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]); + for (i = 1; i < MLKEM_K; i++) + { + poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i], + &b_cache->vec[i]); + poly_add(r, &t); + } + + /* + * This bound is true for the C implementation, but not needed + * in the higher level bounds reasoning. It is thus omitted + * from the spec to not unnecessarily constrain native + * implementations, but checked here nonetheless. + */ + debug_assert_abs_bound(r, MLKEM_N, MLKEM_K * 2 * MLKEM_Q); +} +#else /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, + const polyvec *b, + const polyvec_mulcache *b_cache) +{ + debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); + /* Omitting bounds assertion for cache since native implementations may + * decide not to use a mulcache. Note that the C backend implementation + * of poly_basemul_montgomery_cached() does still include the check.
*/ + polyvec_basemul_acc_montgomery_cached_native(r->coeffs, (const int16_t *)a, + (const int16_t *)b, + (const int16_t *)b_cache); +} +#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ + +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) +{ + polyvec_mulcache b_cache; + polyvec_mulcache_compute(&b_cache, b); + polyvec_basemul_acc_montgomery_cached(r, a, b, &b_cache); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_mulcache_compute(&x->vec[i], &a->vec[i]); + } +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_reduce(polyvec *r) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_reduce(&r->vec[i]); + } + + debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_add(polyvec *r, const polyvec *b) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_add(&r->vec[i], &b->vec[i]); + } +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_tomont(polyvec *r) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_tomont(&r->vec[i]); + } + + debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q); +} + + +/************************************************* + * Name: poly_cbd_eta1 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter MLKEM_ETA1. 
+ * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +static INLINE void poly_cbd_eta1(poly *r, + const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4]) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4)) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1)) +) +{ +#if MLKEM_ETA1 == 2 + poly_cbd2(r, buf); +#elif MLKEM_ETA1 == 3 + poly_cbd3(r, buf); +#else +#error "Invalid value of MLKEM_ETA1" +#endif +} + +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, + const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0, + uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) +{ + ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1]; + ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1]; + ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1]; + ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1]; + memcpy(extkey0, seed, MLKEM_SYMBYTES); + memcpy(extkey1, seed, MLKEM_SYMBYTES); + memcpy(extkey2, seed, MLKEM_SYMBYTES); + memcpy(extkey3, seed, MLKEM_SYMBYTES); + extkey0[MLKEM_SYMBYTES] = nonce0; + extkey1[MLKEM_SYMBYTES] = nonce1; + extkey2[MLKEM_SYMBYTES] = nonce2; + extkey3[MLKEM_SYMBYTES] = nonce3; + prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3); + poly_cbd_eta1(r0, buf0); + poly_cbd_eta1(r1, buf1); + poly_cbd_eta1(r2, buf2); + poly_cbd_eta1(r3, buf3); + + debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1); +} + +#if MLKEM_K == 2 || MLKEM_K == 4 
+/************************************************* + * Name: poly_cbd_eta2 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter MLKEM_ETA2. + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +static INLINE void poly_cbd_eta2(poly *r, + const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4)) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))) +{ +#if MLKEM_ETA2 == 2 + poly_cbd2(r, buf); +#else +#error "Invalid value of MLKEM_ETA2" +#endif +} + +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES], + uint8_t nonce) +{ + ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]; + ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1]; + + memcpy(extkey, seed, MLKEM_SYMBYTES); + extkey[MLKEM_SYMBYTES] = nonce; + prf_eta2(buf, extkey); + + poly_cbd_eta2(r, buf); + + debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA1 + 1); +} +#endif /* MLKEM_K == 2 || MLKEM_K == 4 */ + + +#if MLKEM_K == 2 +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3, + const uint8_t seed[MLKEM_SYMBYTES], + uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, + uint8_t nonce3) +{ + ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4]; + ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1]; + memcpy(extkey[0], seed, MLKEM_SYMBYTES); + memcpy(extkey[1], seed, MLKEM_SYMBYTES); + memcpy(extkey[2], seed, MLKEM_SYMBYTES); + memcpy(extkey[3], seed, MLKEM_SYMBYTES); + extkey[0][MLKEM_SYMBYTES] = nonce0; + extkey[1][MLKEM_SYMBYTES] = nonce1; + extkey[2][MLKEM_SYMBYTES] = nonce2; + 
extkey[3][MLKEM_SYMBYTES] = nonce3; + + prf_eta1(buf1[0], extkey[0]); + prf_eta1(buf1[1], extkey[1]); + prf_eta2(buf2[0], extkey[2]); + prf_eta2(buf2[1], extkey[3]); + + poly_cbd_eta1(r0, buf1[0]); + poly_cbd_eta1(r1, buf1[1]); + poly_cbd_eta2(r2, buf2[0]); + poly_cbd_eta2(r3, buf2[1]); + + debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1); + debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1); +} +#endif /* MLKEM_K == 2 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly_k.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly_k.h new file mode 100644 index 000000000..0aea95912 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly_k.h @@ -0,0 +1,596 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef POLY_K_H +#define POLY_K_H + +#include +#include "common.h" +#include "compress.h" +#include "poly.h" + +#define polyvec MLKEM_NAMESPACE_K(polyvec) +typedef struct +{ + poly vec[MLKEM_K]; +} ALIGN polyvec; + +#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache) +typedef struct +{ + poly_mulcache vec[MLKEM_K]; +} polyvec_mulcache; + +#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du) +/************************************************* + * Name: poly_compress_du + * + * Description: Compression (du bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. 
+ **************************************************/ +static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], + const poly *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU)) + requires(memory_no_alias(a, sizeof(poly))) + requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU))) +{ +#if MLKEM_DU == 10 + poly_compress_d10(r, a); +#elif MLKEM_DU == 11 + poly_compress_d11(r, a); +#else +#error "Invalid value of MLKEM_DU" +#endif +} + +#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du) +/************************************************* + * Name: poly_decompress_du + * + * Description: De-serialization and subsequent decompression (du bits) of a + * polynomial; approximate inverse of poly_compress_du + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). 
+ * + **************************************************/ +static INLINE void poly_decompress_du( + poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) +{ +#if MLKEM_DU == 10 + poly_decompress_d10(r, a); +#elif MLKEM_DU == 11 + poly_decompress_d11(r, a); +#else +#error "Invalid value of MLKEM_DU" +#endif +} + +#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv) +/************************************************* + * Name: poly_compress_dv + * + * Description: Compression (dv bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. 
+ **************************************************/ +static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], + const poly *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV)) + requires(memory_no_alias(a, sizeof(poly))) + requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(r))) +{ +#if MLKEM_DV == 4 + poly_compress_d4(r, a); +#elif MLKEM_DV == 5 + poly_compress_d5(r, a); +#else +#error "Invalid value of MLKEM_DV" +#endif +} + + +#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv) +/************************************************* + * Name: poly_decompress_dv + * + * Description: De-serialization and subsequent decompression (dv bits) of a + * polynomial; approximate inverse of poly_compress + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +static INLINE void poly_decompress_dv( + poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(object_whole(r)) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) +{ +#if MLKEM_DV == 4 + poly_decompress_d4(r, a); +#elif MLKEM_DV == 5 + poly_decompress_d5(r, a); +#else +#error "Invalid value of MLKEM_DV" +#endif +} + +#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du) +/************************************************* + * Name: polyvec_compress_du + * + * Description: Compress and serialize vector of polynomials + * + * Arguments: - uint8_t *r: pointer to output byte array + * (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU) + * - const polyvec *a: pointer to input vector of polynomials. 
+ * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU], + const polyvec *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU)) + requires(memory_no_alias(a, sizeof(polyvec))) + requires(forall(k0, 0, MLKEM_K, + array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) + assigns(object_whole(r)) +); + +#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du) +/************************************************* + * Name: polyvec_decompress_du + * + * Description: De-serialize and decompress vector of polynomials; + * approximate inverse of polyvec_compress_du + * + * Arguments: - polyvec *r: pointer to output vector of polynomials. + * Output will have coefficients normalized to [0,..,q-1]. + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU) + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_decompress_du(polyvec *r, + const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU)) + requires(memory_no_alias(r, sizeof(polyvec))) + assigns(object_whole(r)) + ensures(forall(k0, 0, MLKEM_K, + array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) +); + +#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes) +/************************************************* + * Name: polyvec_tobytes + * + * Description: Serialize vector of polynomials + * + * Arguments: - uint8_t *r: pointer to output byte array + * (needs space for MLKEM_POLYVECBYTES) + * - const polyvec *a: pointer to input vector of polynomials + * Each polynomial must have coefficients in [0,..,q-1]. 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a) +__contract__( + requires(memory_no_alias(a, sizeof(polyvec))) + requires(memory_no_alias(r, MLKEM_POLYVECBYTES)) + requires(forall(k0, 0, MLKEM_K, + array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) + assigns(object_whole(r)) +); + +#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes) +/************************************************* + * Name: polyvec_frombytes + * + * Description: De-serialize vector of polynomials; + * inverse of polyvec_tobytes + * + * Arguments: - polyvec *r: pointer to output vector of polynomials. + * Output will have coefficients normalized in [0..4095]. + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYVECBYTES) + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES]) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + requires(memory_no_alias(a, MLKEM_POLYVECBYTES)) + assigns(object_whole(r)) + ensures(forall(k0, 0, MLKEM_K, + array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) +); + +#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt) +/************************************************* + * Name: polyvec_ntt + * + * Description: Apply forward NTT to all elements of a vector of polynomials. + * + * The input is assumed to be in normal order and + * coefficient-wise bound by MLKEM_Q in absolute value. + * + * The output polynomial is in bitreversed order, and + * coefficient-wise bound by NTT_BOUND in absolute value.
+ * + * Arguments: - polyvec *r: pointer to in/output vector of polynomials + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_ntt(polyvec *r) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + requires(forall(j, 0, MLKEM_K, + array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q))) + assigns(object_whole(r)) + ensures(forall(j, 0, MLKEM_K, + array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND))) +); + +#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont) +/************************************************* + * Name: polyvec_invntt_tomont + * + * Description: Apply inverse NTT to all elements of a vector of polynomials + * and multiply by Montgomery factor 2^16 + * + * The input is assumed to be in bitreversed order, and can + * have arbitrary coefficients in int16_t. + * + * The output polynomial is in normal order, and + * coefficient-wise bound by INVNTT_BOUND in absolute value. + * + * + * Arguments: - polyvec *r: pointer to in/output vector of polynomials + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_invntt_tomont(polyvec *r) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + assigns(object_whole(r)) + ensures(forall(j, 0, MLKEM_K, + array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, INVNTT_BOUND))) +); + +#define polyvec_basemul_acc_montgomery \ + MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery) +/************************************************* + * Name: polyvec_basemul_acc_montgomery + * + * Description: Multiply elements of a and b in NTT domain, accumulate into r, + * and multiply by 2^-16. 
+ * + * Arguments: - poly *r: pointer to output polynomial + * - const polyvec *a: pointer to first input vector of polynomials + * - const polyvec *b: pointer to second input vector of polynomials + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(a, sizeof(polyvec))) + requires(memory_no_alias(b, sizeof(polyvec))) + requires(forall(k1, 0, MLKEM_K, + array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) + assigns(memory_slice(r, sizeof(poly))) +); + + +#define polyvec_basemul_acc_montgomery_cached \ + MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached) +/************************************************* + * Name: polyvec_basemul_acc_montgomery_cached + * + * Description: Scalar product of two vectors of polynomials in NTT domain, + * using mulcache for second operand. + * + * Bounds: + * - Every coefficient of a is assumed to be in [0..4095] + * - No bounds guarantees for the coefficients in the result. + * + * Arguments: - poly *r: pointer to output polynomial + * - const polyvec *a: pointer to first input polynomial vector + * - const polyvec *b: pointer to second input polynomial vector + * - const polyvec_mulcache *b_cache: pointer to mulcache + * for second input polynomial vector. Can be computed + * via polyvec_mulcache_compute(). 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, + const polyvec *b, + const polyvec_mulcache *b_cache) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(a, sizeof(polyvec))) + requires(memory_no_alias(b, sizeof(polyvec))) + requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache))) + requires(forall(k1, 0, MLKEM_K, + array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) + assigns(memory_slice(r, sizeof(poly))) +); + +#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute) +/************************************************************ + * Name: polyvec_mulcache_compute + * + * Description: Computes the mulcache for a vector of polynomials in NTT domain + * + * The mulcache of a degree-2 polynomial b := b0 + b1*X + * in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when + * computing products of b in Fq[X]/(X^2-zeta). + * + * The mulcache of a polynomial in NTT domain -- which is + * a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta), + * for varying zeta, is the 128-tuple of mulcaches of those + * polynomials. + * + * The mulcache of a vector of polynomials is the vector + * of mulcaches of its entries. + * + * Arguments: - x: Pointer to mulcache to be populated + * - a: Pointer to input polynomial vector + ************************************************************/ +/* + * NOTE: The default C implementation of this function populates + * the mulcache with values in (-q,q), but this is not needed for the + * higher level safety proofs, and thus not part of the spec. 
+ */ +MLKEM_NATIVE_INTERNAL_API +void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a) +__contract__( + requires(memory_no_alias(x, sizeof(polyvec_mulcache))) + requires(memory_no_alias(a, sizeof(polyvec))) + assigns(object_whole(x)) +); + +#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce) +/************************************************* + * Name: polyvec_reduce + * + * Description: Applies Barrett reduction to each coefficient + * of each element of a vector of polynomials; + * for details of the Barrett reduction see comments in reduce.c + * + * Arguments: - polyvec *r: pointer to input/output polynomial + **************************************************/ +/* + * NOTE: The semantics of polyvec_reduce() is different in + * the reference implementation, which requires + * signed canonical output data. Unsigned canonical + * outputs are better suited to the only remaining + * use of poly_reduce() in the context of (de)serialization. + */ +MLKEM_NATIVE_INTERNAL_API +void polyvec_reduce(polyvec *r) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + assigns(object_whole(r)) + ensures(forall(k0, 0, MLKEM_K, + array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) +); + +#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add) +/************************************************* + * Name: polyvec_add + * + * Description: Add vectors of polynomials + * + * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be + * added to + * - const polyvec *b: pointer to second input vector of polynomials + * + * The coefficients of r and b must be so that the addition does + * not overflow. Otherwise, the behaviour of this function is undefined. + * + * The coefficients returned in *r are in int16_t which is sufficient + * to prove type-safety of calling units. Therefore, no stronger + * ensures clause is required on this function. 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_add(polyvec *r, const polyvec *b) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + requires(memory_no_alias(b, sizeof(polyvec))) + requires(forall(j0, 0, MLKEM_K, + forall(k0, 0, MLKEM_N, + (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX))) + requires(forall(j1, 0, MLKEM_K, + forall(k1, 0, MLKEM_N, + (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN))) + assigns(object_whole(r)) +); + +#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont) +/************************************************* + * Name: polyvec_tomont + * + * Description: Inplace conversion of all coefficients of a polynomial + * vector from normal domain to Montgomery domain + * + * Bounds: Output < q in absolute value. + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_tomont(polyvec *r) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + assigns(memory_slice(r, sizeof(polyvec))) + assigns(object_whole(r)) + ensures(forall(j, 0, MLKEM_K, + array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q))) +); + +#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x) +/************************************************* + * Name: poly_getnoise_eta1_4x + * + * Description: Batch sample four polynomials deterministically from a seed + * and nonces, with output polynomials close to centered binomial distribution + * with parameter MLKEM_ETA1. 
+ * + * Arguments: - poly *r{0,1,2,3}: pointer to output polynomial + * - const uint8_t *seed: pointer to input seed + * (of length MLKEM_SYMBYTES bytes) + * - uint8_t nonce{0,1,2,3}: one-byte input nonce + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, + const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0, + uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) +/* Depending on MLKEM_K, the pointers passed to this function belong + to the same objects, so we cannot use memory_no_alias for r0-r3. + + NOTE: Somehow it is important to use memory_no_alias() first in the + conjunctions defining each case. +*/ +#if MLKEM_K == 2 +__contract__( + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */ + (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) && + r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2))) + assigns(memory_slice(r0, sizeof(poly))) + assigns(memory_slice(r1, sizeof(poly))) + assigns(memory_slice(r2, sizeof(poly))) + assigns(memory_slice(r3, sizeof(poly))) + ensures( + array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); +); +#elif MLKEM_K == 4 +__contract__( + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + requires( /* Case B: r0, r1, r2, r3 consecutive */ + (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3)) + assigns(memory_slice(r0, sizeof(poly))) + assigns(memory_slice(r1, sizeof(poly))) + assigns(memory_slice(r2, sizeof(poly))) + assigns(memory_slice(r3, sizeof(poly))) + ensures( + array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 
1) + && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); +); +#elif MLKEM_K == 3 +__contract__( + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + requires( /* Case C: r0, r1, r2 consecutive */ + (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) && + r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0))) + assigns(memory_slice(r0, sizeof(poly))) + assigns(memory_slice(r1, sizeof(poly))) + assigns(memory_slice(r2, sizeof(poly))) + assigns(memory_slice(r3, sizeof(poly))) + ensures( + array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); +); +#endif /* MLKEM_K */ + +#if MLKEM_ETA1 == MLKEM_ETA2 +/* + * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024 + * where MLKEM_ETA2 = MLKEM_ETA1 = 2. + * For ml-kem-512, poly_getnoise_eta1122_4x is used instead. + */ +#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x +#endif /* MLKEM_ETA1 == MLKEM_ETA2 */ + +#if MLKEM_K == 2 || MLKEM_K == 4 +#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2) +/************************************************* + * Name: poly_getnoise_eta2 + * + * Description: Sample a polynomial deterministically from a seed and a nonce, + * with output polynomial close to centered binomial distribution + * with parameter MLKEM_ETA2 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *seed: pointer to input seed + * (of length MLKEM_SYMBYTES bytes) + * - uint8_t nonce: one-byte input nonce + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES], + uint8_t nonce) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + assigns(object_whole(r)) + ensures(array_abs_bound(r->coeffs, 0, 
MLKEM_N, MLKEM_ETA2 + 1)) +); +#endif /* MLKEM_K == 2 || MLKEM_K == 4 */ + +#if MLKEM_K == 2 +#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x) +/************************************************* + * Name: poly_getnoise_eta1122_4x + * + * Description: Batch sample four polynomials deterministically from a seed + * and nonces, with output polynomials close to centered binomial + * distribution with parameters MLKEM_ETA1 and MLKEM_ETA2 + * + * Arguments: - poly *r{0,1,2,3}: pointer to output polynomial + * - const uint8_t *seed: pointer to input seed + * (of length MLKEM_SYMBYTES bytes) + * - uint8_t nonce{0,1,2,3}: one-byte input nonce + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3, + const uint8_t seed[MLKEM_SYMBYTES], + uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, + uint8_t nonce3) +__contract__( + requires( /* r0, r1 consecutive, r2, r3 consecutive */ + (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) && + r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2))) + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3)) + ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1) + && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)); +); +#endif /* MLKEM_K == 2 */ + +#endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/polyvec.c deleted file mode 100644 index 50ea1c34a..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/polyvec.c +++ /dev/null @@ -1,330 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "polyvec.h" -#include -#include -#include
"arith_backend.h" -#include "cbd.h" -#include "ntt.h" -#include "poly.h" -#include "symmetric.h" - -#include "debug.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1) -#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2) -/* End of static namespacing */ - -MLKEM_NATIVE_INTERNAL_API -void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU], - const polyvec *a) -{ - unsigned i; - debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_K; i++) - { - poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]); - } -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_decompress_du(polyvec *r, - const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU]) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU); - } - - debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a) -{ - unsigned i; - debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_K; i++) - { - poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]); - } -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES]) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES); - } - - debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_ntt(polyvec *r) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_ntt(&r->vec[i]); - } - - debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_invntt_tomont(polyvec *r) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_invntt_tomont(&r->vec[i]); - } - - 
debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND); -} - -#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, - const polyvec *b, - const polyvec_mulcache *b_cache) -{ - unsigned i; - poly t; - debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); - - poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]); - for (i = 1; i < MLKEM_K; i++) - { - poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i], - &b_cache->vec[i]); - poly_add(r, &t); - } - - /* - * This bound is true for the C implementation, but not needed - * in the higher level bounds reasoning. It is thus omitted - * them from the spec to not unnecessarily constrain native - * implementations, but checked here nonetheless. - */ - debug_assert_abs_bound(r, MLKEM_K, MLKEM_N * 2 * MLKEM_Q); -} -#else /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, - const polyvec *b, - const polyvec_mulcache *b_cache) -{ - debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); - /* Omitting bounds assertion for cache since native implementations may - * decide not to use a mulcache. Note that the C backend implementation - * of poly_basemul_montgomery_cached() does still include the check. 
*/ - polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache); -} -#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ - -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) -{ - polyvec_mulcache b_cache; - polyvec_mulcache_compute(&b_cache, b); - polyvec_basemul_acc_montgomery_cached(r, a, b, &b_cache); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_mulcache_compute(&x->vec[i], &a->vec[i]); - } -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_reduce(polyvec *r) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_reduce(&r->vec[i]); - } - - debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_add(polyvec *r, const polyvec *b) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_add(&r->vec[i], &b->vec[i]); - } -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_tomont(polyvec *r) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_tomont(&r->vec[i]); - } - - debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q); -} - - -/************************************************* - * Name: poly_cbd_eta1 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter MLKEM_ETA1. 
- * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -static INLINE void poly_cbd_eta1(poly *r, - const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4]) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4)) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1)) -) -{ -#if MLKEM_ETA1 == 2 - poly_cbd2(r, buf); -#elif MLKEM_ETA1 == 3 - poly_cbd3(r, buf); -#else -#error "Invalid value of MLKEM_ETA1" -#endif -} - -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0, - uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) -{ - ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1]; - ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1]; - ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1]; - ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1]; - memcpy(extkey0, seed, MLKEM_SYMBYTES); - memcpy(extkey1, seed, MLKEM_SYMBYTES); - memcpy(extkey2, seed, MLKEM_SYMBYTES); - memcpy(extkey3, seed, MLKEM_SYMBYTES); - extkey0[MLKEM_SYMBYTES] = nonce0; - extkey1[MLKEM_SYMBYTES] = nonce1; - extkey2[MLKEM_SYMBYTES] = nonce2; - extkey3[MLKEM_SYMBYTES] = nonce3; - prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3); - poly_cbd_eta1(r0, buf0); - poly_cbd_eta1(r1, buf1); - poly_cbd_eta1(r2, buf2); - poly_cbd_eta1(r3, buf3); - - debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1); -} - -#if MLKEM_K == 2 || MLKEM_K == 4 
-/************************************************* - * Name: poly_cbd_eta2 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter MLKEM_ETA2. - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -static INLINE void poly_cbd_eta2(poly *r, - const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4)) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))) -{ -#if MLKEM_ETA2 == 2 - poly_cbd2(r, buf); -#else -#error "Invalid value of MLKEM_ETA2" -#endif -} - -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES], - uint8_t nonce) -{ - ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]; - ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1]; - - memcpy(extkey, seed, MLKEM_SYMBYTES); - extkey[MLKEM_SYMBYTES] = nonce; - prf_eta2(buf, extkey); - - poly_cbd_eta2(r, buf); - - debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA1 + 1); -} -#endif /* MLKEM_K == 2 || MLKEM_K == 4 */ - - -#if MLKEM_K == 2 -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t seed[MLKEM_SYMBYTES], - uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, - uint8_t nonce3) -{ - ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4]; - ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1]; - memcpy(extkey[0], seed, MLKEM_SYMBYTES); - memcpy(extkey[1], seed, MLKEM_SYMBYTES); - memcpy(extkey[2], seed, MLKEM_SYMBYTES); - memcpy(extkey[3], seed, MLKEM_SYMBYTES); - extkey[0][MLKEM_SYMBYTES] = nonce0; - extkey[1][MLKEM_SYMBYTES] = nonce1; - extkey[2][MLKEM_SYMBYTES] = nonce2; - 
extkey[3][MLKEM_SYMBYTES] = nonce3; - - prf_eta1(buf1[0], extkey[0]); - prf_eta1(buf1[1], extkey[1]); - prf_eta2(buf2[0], extkey[2]); - prf_eta2(buf2[1], extkey[3]); - - poly_cbd_eta1(r0, buf1[0]); - poly_cbd_eta1(r1, buf1[1]); - poly_cbd_eta2(r2, buf2[0]); - poly_cbd_eta2(r3, buf2[1]); - - debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1); - debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1); -} -#endif /* MLKEM_K == 2 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/polyvec.h deleted file mode 100644 index 8be8579e0..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/polyvec.h +++ /dev/null @@ -1,595 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef POLYVEC_H -#define POLYVEC_H - -#include -#include "common.h" -#include "poly.h" - -#define polyvec MLKEM_NAMESPACE_K(polyvec) -typedef struct -{ - poly vec[MLKEM_K]; -} ALIGN polyvec; - -#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache) -typedef struct -{ - poly_mulcache vec[MLKEM_K]; -} polyvec_mulcache; - -#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du) -/************************************************* - * Name: poly_compress_du - * - * Description: Compression (du bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], - const poly *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU)) - requires(memory_no_alias(a, sizeof(poly))) - requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU))) -{ -#if MLKEM_DU == 10 - poly_compress_d10(r, a); -#elif MLKEM_DU == 11 - poly_compress_d11(r, a); -#else -#error "Invalid value of MLKEM_DU" -#endif -} - -#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du) -/************************************************* - * Name: poly_decompress_du - * - * Description: De-serialization and subsequent decompression (du bits) of a - * polynomial; approximate inverse of poly_compress_du - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -static INLINE void poly_decompress_du( - poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) -{ -#if MLKEM_DU == 10 - poly_decompress_d10(r, a); -#elif MLKEM_DU == 11 - poly_decompress_d11(r, a); -#else -#error "Invalid value of MLKEM_DU" -#endif -} - -#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv) -/************************************************* - * Name: poly_compress_dv - * - * Description: Compression (dv bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], - const poly *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV)) - requires(memory_no_alias(a, sizeof(poly))) - requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(r))) -{ -#if MLKEM_DV == 4 - poly_compress_d4(r, a); -#elif MLKEM_DV == 5 - poly_compress_d5(r, a); -#else -#error "Invalid value of MLKEM_DV" -#endif -} - - -#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv) -/************************************************* - * Name: poly_decompress_dv - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -static INLINE void poly_decompress_dv( - poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(object_whole(r)) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) -{ -#if MLKEM_DV == 4 - poly_decompress_d4(r, a); -#elif MLKEM_DV == 5 - poly_decompress_d5(r, a); -#else -#error "Invalid value of MLKEM_DV" -#endif -} - -#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du) -/************************************************* - * Name: polyvec_compress_du - * - * Description: Compress and serialize vector of polynomials - * - * Arguments: - uint8_t *r: pointer to output byte array - * (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU) - * - const polyvec *a: pointer to input vector of polynomials. 
- * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU], - const polyvec *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU)) - requires(memory_no_alias(a, sizeof(polyvec))) - requires(forall(k0, 0, MLKEM_K, - array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) - assigns(object_whole(r)) -); - -#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du) -/************************************************* - * Name: polyvec_decompress_du - * - * Description: De-serialize and decompress vector of polynomials; - * approximate inverse of polyvec_compress_du - * - * Arguments: - polyvec *r: pointer to output vector of polynomials. - * Output will have coefficients normalized to [0,..,q-1]. - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU) - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_decompress_du(polyvec *r, - const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU)) - requires(memory_no_alias(r, sizeof(polyvec))) - assigns(object_whole(r)) - ensures(forall(k0, 0, MLKEM_K, - array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) -); - -#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes) -/************************************************* - * Name: polyvec_tobytes - * - * Description: Serialize vector of polynomials - * - * Arguments: - uint8_t *r: pointer to output byte array - * (needs space for MLKEM_POLYVECBYTES) - * - const polyvec *a: pointer to input vector of polynomials - * Each polynomial must have coefficients in [0,..,q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a) -__contract__( - requires(memory_no_alias(a, sizeof(polyvec))) - requires(memory_no_alias(r, MLKEM_POLYVECBYTES)) - requires(forall(k0, 0, MLKEM_K, - array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) - assigns(object_whole(r)) -); - -#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes) -/************************************************* - * Name: polyvec_frombytes - * - * Description: De-serialize vector of polynomials; - * inverse of polyvec_tobytes - * - * Arguments: - const polyvec *a: pointer to output vector of polynomials - * (of length MLKEM_POLYVECBYTES). Output will have coefficients - * normalized in [0..4095]. - * - uint8_t *r: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES]) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - requires(memory_no_alias(a, MLKEM_POLYVECBYTES)) - assigns(object_whole(r)) - ensures(forall(k0, 0, MLKEM_K, - array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) -); - -#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt) -/************************************************* - * Name: polyvec_ntt - * - * Description: Apply forward NTT to all elements of a vector of polynomials. - * - * The input is assumed to be in normal order and - * coefficient-wise bound by MLKEM_Q in absolute value. - * - * The output polynomial is in bitreversed order, and - * coefficient-wise bound by NTT_BOUND in absolute value. 
- * - * Arguments: - polyvec *r: pointer to in/output vector of polynomials - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_ntt(polyvec *r) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - requires(forall(j, 0, MLKEM_K, - array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q))) - assigns(object_whole(r)) - ensures(forall(j, 0, MLKEM_K, - array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND))) -); - -#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont) -/************************************************* - * Name: polyvec_invntt_tomont - * - * Description: Apply inverse NTT to all elements of a vector of polynomials - * and multiply by Montgomery factor 2^16 - * - * The input is assumed to be in bitreversed order, and can - * have arbitrary coefficients in int16_t. - * - * The output polynomial is in normal order, and - * coefficient-wise bound by INVNTT_BOUND in absolute value. - * - * - * Arguments: - polyvec *r: pointer to in/output vector of polynomials - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_invntt_tomont(polyvec *r) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - assigns(object_whole(r)) - ensures(forall(j, 0, MLKEM_K, - array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, INVNTT_BOUND))) -); - -#define polyvec_basemul_acc_montgomery \ - MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery) -/************************************************* - * Name: polyvec_basemul_acc_montgomery - * - * Description: Multiply elements of a and b in NTT domain, accumulate into r, - * and multiply by 2^-16. 
- * - * Arguments: - poly *r: pointer to output polynomial - * - const polyvec *a: pointer to first input vector of polynomials - * - const polyvec *b: pointer to second input vector of polynomials - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(a, sizeof(polyvec))) - requires(memory_no_alias(b, sizeof(polyvec))) - requires(forall(k1, 0, MLKEM_K, - array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) - assigns(memory_slice(r, sizeof(poly))) -); - - -#define polyvec_basemul_acc_montgomery_cached \ - MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached) -/************************************************* - * Name: polyvec_basemul_acc_montgomery_cached - * - * Description: Scalar product of two vectors of polynomials in NTT domain, - * using mulcache for second operand. - * - * Bounds: - * - Every coefficient of a is assumed to be in [0..4095] - * - No bounds guarantees for the coefficients in the result. - * - * Arguments: - poly *r: pointer to output polynomial - * - const polyvec *a: pointer to first input polynomial vector - * - const polyvec *b: pointer to second input polynomial vector - * - const polyvec_mulcache *b_cache: pointer to mulcache - * for second input polynomial vector. Can be computed - * via polyvec_mulcache_compute(). 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, - const polyvec *b, - const polyvec_mulcache *b_cache) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(a, sizeof(polyvec))) - requires(memory_no_alias(b, sizeof(polyvec))) - requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache))) - requires(forall(k1, 0, MLKEM_K, - array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) - assigns(memory_slice(r, sizeof(poly))) -); - -#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute) -/************************************************************ - * Name: polyvec_mulcache_compute - * - * Description: Computes the mulcache for a vector of polynomials in NTT domain - * - * The mulcache of a degree-2 polynomial b := b0 + b1*X - * in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when - * computing products of b in Fq[X]/(X^2-zeta). - * - * The mulcache of a polynomial in NTT domain -- which is - * a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta), - * for varying zeta, is the 128-tuple of mulcaches of those - * polynomials. - * - * The mulcache of a vector of polynomials is the vector - * of mulcaches of its entries. - * - * Arguments: - x: Pointer to mulcache to be populated - * - a: Pointer to input polynomial vector - ************************************************************/ -/* - * NOTE: The default C implementation of this function populates - * the mulcache with values in (-q,q), but this is not needed for the - * higher level safety proofs, and thus not part of the spec. 
- */ -MLKEM_NATIVE_INTERNAL_API -void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a) -__contract__( - requires(memory_no_alias(x, sizeof(polyvec_mulcache))) - requires(memory_no_alias(a, sizeof(polyvec))) - assigns(object_whole(x)) -); - -#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce) -/************************************************* - * Name: polyvec_reduce - * - * Description: Applies Barrett reduction to each coefficient - * of each element of a vector of polynomials; - * for details of the Barrett reduction see comments in reduce.c - * - * Arguments: - polyvec *r: pointer to input/output polynomial - **************************************************/ -/* - * NOTE: The semantics of polyvec_reduce() is different in - * the reference implementation, which requires - * signed canonical output data. Unsigned canonical - * outputs are better suited to the only remaining - * use of poly_reduce() in the context of (de)serialization. - */ -MLKEM_NATIVE_INTERNAL_API -void polyvec_reduce(polyvec *r) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - assigns(object_whole(r)) - ensures(forall(k0, 0, MLKEM_K, - array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) -); - -#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add) -/************************************************* - * Name: polyvec_add - * - * Description: Add vectors of polynomials - * - * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be - * added to - * - const polyvec *b: pointer to second input vector of polynomials - * - * The coefficients of r and b must be so that the addition does - * not overflow. Otherwise, the behaviour of this function is undefined. - * - * The coefficients returned in *r are in int16_t which is sufficient - * to prove type-safety of calling units. Therefore, no stronger - * ensures clause is required on this function. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_add(polyvec *r, const polyvec *b) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - requires(memory_no_alias(b, sizeof(polyvec))) - requires(forall(j0, 0, MLKEM_K, - forall(k0, 0, MLKEM_N, - (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX))) - requires(forall(j1, 0, MLKEM_K, - forall(k1, 0, MLKEM_N, - (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN))) - assigns(object_whole(r)) -); - -#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont) -/************************************************* - * Name: polyvec_tomont - * - * Description: Inplace conversion of all coefficients of a polynomial - * vector from normal domain to Montgomery domain - * - * Bounds: Output < q in absolute value. - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_tomont(polyvec *r) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - assigns(memory_slice(r, sizeof(polyvec))) - assigns(object_whole(r)) - ensures(forall(j, 0, MLKEM_K, - array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q))) -); - -#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x) -/************************************************* - * Name: poly_getnoise_eta1_4x - * - * Description: Batch sample four polynomials deterministically from a seed - * and nonces, with output polynomials close to centered binomial distribution - * with parameter MLKEM_ETA1. 
- * - * Arguments: - poly *r{0,1,2,3}: pointer to output polynomial - * - const uint8_t *seed: pointer to input seed - * (of length MLKEM_SYMBYTES bytes) - * - uint8_t nonce{0,1,2,3}: one-byte input nonce - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0, - uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) -/* Depending on MLKEM_K, the pointers passed to this function belong - to the same objects, so we cannot use memory_no_alias for r0-r3. - - NOTE: Somehow it is important to use memory_no_alias() first in the - conjunctions defining each case. -*/ -#if MLKEM_K == 2 -__contract__( - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */ - (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) && - r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2))) - assigns(memory_slice(r0, sizeof(poly))) - assigns(memory_slice(r1, sizeof(poly))) - assigns(memory_slice(r2, sizeof(poly))) - assigns(memory_slice(r3, sizeof(poly))) - ensures( - array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); -); -#elif MLKEM_K == 4 -__contract__( - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - requires( /* Case B: r0, r1, r2, r3 consecutive */ - (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3)) - assigns(memory_slice(r0, sizeof(poly))) - assigns(memory_slice(r1, sizeof(poly))) - assigns(memory_slice(r2, sizeof(poly))) - assigns(memory_slice(r3, sizeof(poly))) - ensures( - array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 
1) - && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); -); -#elif MLKEM_K == 3 -__contract__( - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - requires( /* Case C: r0, r1, r2 consecutive */ - (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) && - r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0))) - assigns(memory_slice(r0, sizeof(poly))) - assigns(memory_slice(r1, sizeof(poly))) - assigns(memory_slice(r2, sizeof(poly))) - assigns(memory_slice(r3, sizeof(poly))) - ensures( - array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); -); -#endif /* MLKEM_K */ - -#if MLKEM_ETA1 == MLKEM_ETA2 -/* - * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024 - * where MLKEM_ETA2 = MLKEM_ETA1 = 2. - * For ml-kem-512, poly_getnoise_eta1122_4x is used instead. - */ -#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x -#endif /* MLKEM_ETA1 == MLKEM_ETA2 */ - -#if MLKEM_K == 2 || MLKEM_K == 4 -#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2) -/************************************************* - * Name: poly_getnoise_eta2 - * - * Description: Sample a polynomial deterministically from a seed and a nonce, - * with output polynomial close to centered binomial distribution - * with parameter MLKEM_ETA2 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *seed: pointer to input seed - * (of length MLKEM_SYMBYTES bytes) - * - uint8_t nonce: one-byte input nonce - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES], - uint8_t nonce) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - assigns(object_whole(r)) - ensures(array_abs_bound(r->coeffs, 0, 
MLKEM_N, MLKEM_ETA2 + 1)) -); -#endif /* MLKEM_K == 2 || MLKEM_K == 4 */ - -#if MLKEM_K == 2 -#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x) -/************************************************* - * Name: poly_getnoise_eta1122_4x - * - * Description: Batch sample four polynomials deterministically from a seed - * and a nonces, with output polynomials close to centered binomial - * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2 - * - * Arguments: - poly *r{0,1,2,3}: pointer to output polynomial - * - const uint8_t *seed: pointer to input seed - * (of length MLKEM_SYMBYTES bytes) - * - uint8_t nonce{0,1,2,3}: one-byte input nonce - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t seed[MLKEM_SYMBYTES], - uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, - uint8_t nonce3) -__contract__( - requires( /* r0, r1 consecutive, r2, r3 consecutive */ - (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) && - r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2))) - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3)) - ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1) - && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)); -); -#endif /* MLKEM_K == 2 */ - -#endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/reduce.h deleted file mode 100644 index b432a4201..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/reduce.h +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef REDUCE_H -#define REDUCE_H - -#include -#include 
"cbmc.h" -#include "common.h" -#include "debug.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16) -#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic) -#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce) -#define fqmul MLKEM_NAMESPACE(fqmul) -#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce) -/* End of static namespacing */ - -#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */ - -/************************************************* - * Name: cast_uint16_to_int16 - * - * Description: Cast uint16 value to int16 - * - * Returns: - * input x in 0 .. 32767: returns value unchanged - * input x in 32768 .. 65535: returns (x - 65536) - **************************************************/ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "conversion" -#endif -ALWAYS_INLINE -static INLINE int16_t cast_uint16_to_int16(uint16_t x) -{ - /* - * PORTABILITY: This relies on uint16_t -> int16_t - * being implemented as the inverse of int16_t -> uint16_t, - * which is implementation-defined (C99 6.3.1.3 (3)) - * CBMC (correctly) fails to prove this conversion is OK, - * so we have to suppress that check here - */ - return (int16_t)x; -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************* - * Name: montgomery_reduce_generic - * - * Description: Generic Montgomery reduction; given a 32-bit integer a, computes - * 16-bit integer congruent to a * R^-1 mod q, where R=2^16 - * - * Arguments: - int32_t a: input integer to be reduced - * - * Returns: integer congruent to a * R^-1 modulo q, with absolute value - * <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2 - * - **************************************************/ -ALWAYS_INLINE -static INLINE int16_t montgomery_reduce_generic(int32_t a) -{ - /* QINV == 
-3327 converted to uint16_t == -3327 + 65536 == 62209 */ - const uint32_t QINV = 62209; /* q^-1 mod 2^16 */ - - /* Compute a*q^{-1} mod 2^16 in unsigned representatives */ - const uint16_t a_reduced = a & UINT16_MAX; - const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX; - - /* Lift to signed canonical representative mod 2^16. */ - const int16_t t = cast_uint16_to_int16(a_inverted); - - int32_t r = a - ((int32_t)t * MLKEM_Q); - /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */ - - /* - * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, - * implementation-defined for negative left argument. Here, - * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) - */ - r = r >> 16; - /* Bounds: |r >> 16| <= ceil(|r| / 2^16) - * <= ceil(|a| / 2^16 + MLKEM_Q / 2) - * <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 - * - * (Note that |a >> n| = ceil(|a| / 2^16) for negative a) - */ - - return (int16_t)r; -} - -/************************************************* - * Name: montgomery_reduce - * - * Description: Montgomery reduction - * - * Arguments: - int32_t a: input integer to be reduced - * Must be smaller than 2 * 2^12 * 2^15 in absolute value. - * - * Returns: integer congruent to a * R^-1 modulo q, - * smaller than 2 * q in absolute value. 
- **************************************************/ -static INLINE int16_t montgomery_reduce(int32_t a) -__contract__( - requires(a > -(2 * UINT12_LIMIT * 32768)) - requires(a < (2 * UINT12_LIMIT * 32768)) - ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q) -) -{ - int16_t res; - debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768); - - res = montgomery_reduce_generic(a); - /* Bounds: - * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2 - * <= UINT12_LIMIT + (MLKEM_Q + 1) / 2 - * < 2 * MLKEM_Q */ - - debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q); - return res; -} - -/************************************************* - * Name: fqmul - * - * Description: Montgomery multiplication modulo q=3329 - * - * Arguments: - int16_t a: first factor - * Can be any int16_t. - * - int16_t b: second factor. - * Must be signed canonical (abs value <(q+1)/2) - * - * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and - * smaller than q in absolute value. 
- * - **************************************************/ -static INLINE int16_t fqmul(int16_t a, int16_t b) -__contract__( - requires(b > -HALF_Q) - requires(b < HALF_Q) - ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q) -) -{ - int16_t res; - debug_assert_abs_bound(&b, 1, HALF_Q); - - res = montgomery_reduce((int32_t)a * (int32_t)b); - /* Bounds: - * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2 - * < MLKEM_Q - */ - - debug_assert_abs_bound(&res, 1, MLKEM_Q); - return res; -} - -/************************************************* - * Name: barrett_reduce - * - * Description: Barrett reduction; given a 16-bit integer a, computes - * centered representative congruent to a mod q in - * {-(q-1)/2,...,(q-1)/2} - * - * Arguments: - int16_t a: input integer to be reduced - * - * Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. - **************************************************/ -static INLINE int16_t barrett_reduce(int16_t a) -__contract__( - ensures(return_value > -HALF_Q && return_value < HALF_Q) -) -{ - /* - * To divide by MLKEM_Q using Barrett multiplication, the "magic number" - * multiplier is round_to_nearest(2**26/MLKEM_Q) - */ - const int BPOWER = 26; - const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q; - - /* - * Compute round_to_nearest(a/MLKEM_Q) using the multiplier - * above and shift by BPOWER places. - * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, - * implementation-defined for negative left argument. Here, - * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) - */ - const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER; - - /* - * t is in -10 .. 
+10, so we need 32-bit math to - * evaluate t * MLKEM_Q and the subsequent subtraction - */ - int16_t res = (int16_t)(a - t * MLKEM_Q); - - debug_assert_abs_bound(&res, 1, HALF_Q); - return res; -} - -#endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/rej_uniform.c deleted file mode 100644 index cbbe4407f..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/rej_uniform.c +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) - -#include "arith_backend.h" -#include "debug.h" -#include "fips202.h" -#include "fips202x4.h" -#include "rej_uniform.h" -#include "symmetric.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define rej_uniform MLKEM_NAMESPACE(rej_uniform) -#define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar) -/* End of static namespacing */ - -static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target, - unsigned int offset, const uint8_t *buf, - unsigned int buflen) -__contract__( - requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0) - requires(memory_no_alias(r, sizeof(int16_t) * target)) - requires(memory_no_alias(buf, buflen)) - requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * target)) - ensures(offset <= return_value && return_value <= target) - ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q)) -) -{ - unsigned int ctr, pos; - uint16_t val0, val1; - - debug_assert_bound(r, offset, 0, MLKEM_Q); - - ctr = offset; - pos = 0; - /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */ - while (ctr < target && pos + 3 <= buflen) - __loop__( - 
invariant(offset <= ctr && ctr <= target && pos <= buflen) - invariant(ctr > 0 ==> array_bound(r, 0, ctr, 0, MLKEM_Q))) - { - val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; - val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; - pos += 3; - - if (val0 < MLKEM_Q) - { - r[ctr++] = val0; - } - if (ctr < target && val1 < MLKEM_Q) - { - r[ctr++] = val1; - } - } - - debug_assert_bound(r, ctr, 0, MLKEM_Q); - return ctr; -} - -#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM) -/************************************************* - * Name: rej_uniform - * - * Description: Run rejection sampling on uniform random bytes to generate - * uniform random integers mod q - * - * Arguments: - int16_t *r: pointer to output buffer - * - unsigned int target: requested number of 16-bit integers - * (uniform mod q). - * Must be <= 4096. - * - unsigned int offset: number of 16-bit integers that have - * already been sampled. - * Must be <= target. - * - const uint8_t *buf: pointer to input buffer - * (assumed to be uniform random bytes) - * - unsigned int buflen: length of input buffer in bytes - * Must be <= 4096. - * Must be a multiple of 3. - * - * Note: Strictly speaking, only a few values of buflen near UINT_MAX need - * excluding. The limit of 4096 is somewhat arbitary but sufficient for all - * uses of this function. Similarly, the actual limit for target is UINT_MAX/2. - * - * Returns the new offset of sampled 16-bit integers, at most target, - * and at least the initial offset. - * If the new offset is strictly less than len, all of the input buffers - * is guaranteed to have been consumed. If it is equal to len, no information - * is provided on how many bytes of the input buffer have been consumed. - **************************************************/ - -/* - * NOTE: The signature differs from the Kyber reference implementation - * in that it adds the offset and always expects the base of the target - * buffer. 
This avoids shifting the buffer base in the caller, which appears - * tricky to reason about. - */ -static unsigned int rej_uniform(int16_t *r, unsigned int target, - unsigned int offset, const uint8_t *buf, - unsigned int buflen) -__contract__( - requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0) - requires(memory_no_alias(r, sizeof(int16_t) * target)) - requires(memory_no_alias(buf, buflen)) - requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * target)) - ensures(offset <= return_value && return_value <= target) - ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q)) -) -{ - return rej_uniform_scalar(r, target, offset, buf, buflen); -} -#else /* MLKEM_USE_NATIVE_REJ_UNIFORM */ -static unsigned int rej_uniform(int16_t *r, unsigned int target, - unsigned int offset, const uint8_t *buf, - unsigned int buflen) -{ - int ret; - - /* Sample from large buffer with full lane as much as possible. 
*/ - ret = rej_uniform_native(r + offset, target - offset, buf, buflen); - if (ret != -1) - { - unsigned res = offset + (unsigned)ret; - debug_assert_bound(r, res, 0, MLKEM_Q); - return res; - } - - return rej_uniform_scalar(r, target, offset, buf, buflen); -} -#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */ - -#ifndef MLKEM_GEN_MATRIX_NBLOCKS -#define MLKEM_GEN_MATRIX_NBLOCKS \ - ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE) -#endif - -MLKEM_NATIVE_INTERNAL_API -void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4]) -{ - /* Temporary buffers for XOF output before rejection sampling */ - uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - - /* Tracks the number of coefficients we have already sampled */ - unsigned int ctr[KECCAK_WAY]; - xof_x4_ctx statex; - unsigned int buflen; - - shake128x4_inc_init(&statex); - - /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */ - xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3], - MLKEM_SYMBYTES + 2); - - /* - * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS. - * This should generate the matrix entries with high probability. - */ - xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS, - &statex); - buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE; - ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen); - ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen); - ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen); - ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen); - - /* - * So long as not all matrix entries have been generated, squeeze - * one more block a time until we're done. 
- */ - buflen = XOF_RATE; - while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N || - ctr[3] < MLKEM_N) - __loop__( - assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0), - object_whole(buf1), object_whole(buf2), object_whole(buf3)) - invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N) - invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N) - invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q)) - invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q)) - invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q)) - invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q))) - { - xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex); - ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen); - ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen); - ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen); - ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen); - } - - xof_x4_release(&statex); -} - -MLKEM_NATIVE_INTERNAL_API -void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) -{ - xof_ctx state; - uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - unsigned int ctr, buflen; - - shake128_inc_init(&state); - - xof_absorb(&state, seed, MLKEM_SYMBYTES + 2); - - /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS. - */ - /* This should generate the matrix entry with high probability. 
*/ - xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state); - buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE; - ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen); - - /* Squeeze + sample one more block a time until we're done */ - buflen = XOF_RATE; - while (ctr < MLKEM_N) - __loop__( - assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf)) - invariant(ctr <= MLKEM_N) - invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q))) - { - xof_squeezeblocks(buf, 1, &state); - ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen); - } - - xof_release(&state); -} - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform) -int empty_cu_rej_uniform; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/rej_uniform.h deleted file mode 100644 index 801287259..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/rej_uniform.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef REJ_UNIFORM_H -#define REJ_UNIFORM_H - -#include -#include -#include "cbmc.h" -#include "common.h" -#include "poly.h" - -#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4) -/************************************************* - * Name: poly_rej_uniform_x4 - * - * Description: Generate four polynomials using rejection sampling - * on (pseudo-)uniformly random bytes sampled from a seed. - * - * Arguments: - poly *vec: Pointer to an array of 4 polynomials - * to be sampled. - * - uint8_t *seed[4]: Pointer to array of four pointers - * pointing to the seed buffers of size - * MLKEM_SYMBYTES + 2 each. 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4]) -__contract__( - requires(memory_no_alias(vec, sizeof(poly) * 4)) - requires(memory_no_alias(seed, sizeof(uint8_t*) * 4)) - requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2)) - requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2)) - requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2)) - requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2)) - assigns(memory_slice(vec, sizeof(poly) * 4)) - ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q))); - -#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform) -/************************************************* - * Name: poly_rej_uniform - * - * Description: Generate polynomial using rejection sampling - * on (pseudo-)uniformly random bytes sampled from a seed. - * - * Arguments: - poly *vec: Pointer to polynomial to be sampled. - * - uint8_t *seed: Pointer to seed buffer of size - * MLKEM_SYMBYTES + 2 each. 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) -__contract__( - requires(memory_no_alias(entry, sizeof(poly))) - requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2)) - assigns(memory_slice(entry, sizeof(poly))) - ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q))); - -#endif /* REJ_UNIFORM_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/sampling.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/sampling.c new file mode 100644 index 000000000..98cbdcb74 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/sampling.c @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include "common.h" +#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) + +#include "arith_backend.h" +#include "debug.h" +#include "fips202.h" +#include "fips202x4.h" +#include "sampling.h" +#include "symmetric.h" + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define rej_uniform MLKEM_NAMESPACE(rej_uniform) +#define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar) +#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) +#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) +/* End of static namespacing */ + +static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target, + unsigned int offset, const uint8_t *buf, + unsigned int buflen) +__contract__( + requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0) + requires(memory_no_alias(r, sizeof(int16_t) * target)) + requires(memory_no_alias(buf, buflen)) + requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * target)) + ensures(offset <= return_value && return_value <= target) + ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q)) +) +{ + unsigned int ctr, pos; + uint16_t val0, val1; + + debug_assert_bound(r, offset, 0, MLKEM_Q); + + ctr = offset; + pos = 0; + /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */ + while (ctr < target && pos + 3 <= buflen) + __loop__( + invariant(offset <= ctr && ctr <= target && pos <= buflen) + invariant(ctr > 0 ==> array_bound(r, 0, ctr, 0, MLKEM_Q))) + { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; + pos += 3; + + if (val0 < MLKEM_Q) + { + r[ctr++] = val0; + } + if (ctr < target && val1 < MLKEM_Q) + { + r[ctr++] = val1; + } + } + + debug_assert_bound(r, ctr, 0, MLKEM_Q); + return ctr; +} + +#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM) +/************************************************* + * Name: rej_uniform + * + * Description: Run rejection sampling on uniform random bytes to generate + * uniform random integers mod q + * + * Arguments: - int16_t *r: pointer to output buffer + * - unsigned int target: requested number of 16-bit integers + * (uniform mod q). + * Must be <= 4096. 
+ * - unsigned int offset: number of 16-bit integers that have + * already been sampled. + * Must be <= target. + * - const uint8_t *buf: pointer to input buffer + * (assumed to be uniform random bytes) + * - unsigned int buflen: length of input buffer in bytes + * Must be <= 4096. + * Must be a multiple of 3. + * + * Note: Strictly speaking, only a few values of buflen near UINT_MAX need + * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all + * uses of this function. Similarly, the actual limit for target is UINT_MAX/2. + * + * Returns the new offset of sampled 16-bit integers, at most target, + * and at least the initial offset. + * If the new offset is strictly less than target, all of the input buffer + * is guaranteed to have been consumed. If it is equal to target, no information + * is provided on how many bytes of the input buffer have been consumed. + **************************************************/ + +/* + * NOTE: The signature differs from the Kyber reference implementation + * in that it adds the offset and always expects the base of the target + * buffer. This avoids shifting the buffer base in the caller, which appears + * tricky to reason about.
+ */ +static unsigned int rej_uniform(int16_t *r, unsigned int target, + unsigned int offset, const uint8_t *buf, + unsigned int buflen) +__contract__( + requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0) + requires(memory_no_alias(r, sizeof(int16_t) * target)) + requires(memory_no_alias(buf, buflen)) + requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * target)) + ensures(offset <= return_value && return_value <= target) + ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q)) +) +{ + return rej_uniform_scalar(r, target, offset, buf, buflen); +} +#else /* MLKEM_USE_NATIVE_REJ_UNIFORM */ +static unsigned int rej_uniform(int16_t *r, unsigned int target, + unsigned int offset, const uint8_t *buf, + unsigned int buflen) +{ + int ret; + + /* Sample from large buffer with full lane as much as possible. */ + ret = rej_uniform_native(r + offset, target - offset, buf, buflen); + if (ret != -1) + { + unsigned res = offset + (unsigned)ret; + debug_assert_bound(r, res, 0, MLKEM_Q); + return res; + } + + return rej_uniform_scalar(r, target, offset, buf, buflen); +} +#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */ + +#ifndef MLKEM_GEN_MATRIX_NBLOCKS +#define MLKEM_GEN_MATRIX_NBLOCKS \ + ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE) +#endif + +MLKEM_NATIVE_INTERNAL_API +void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4]) +{ + /* Temporary buffers for XOF output before rejection sampling */ + uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + + /* Tracks the number of coefficients we have already sampled */ + unsigned int ctr[KECCAK_WAY]; + xof_x4_ctx statex; + unsigned int buflen; + + shake128x4_inc_init(&statex); + + /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */ + 
xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3], + MLKEM_SYMBYTES + 2); + + /* + * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS. + * This should generate the matrix entries with high probability. + */ + xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS, + &statex); + buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE; + ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen); + ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen); + ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen); + ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen); + + /* + * So long as not all matrix entries have been generated, squeeze + * one more block a time until we're done. + */ + buflen = XOF_RATE; + while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N || + ctr[3] < MLKEM_N) + __loop__( + assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0), + object_whole(buf1), object_whole(buf2), object_whole(buf3)) + invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N) + invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N) + invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q)) + invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q)) + invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q)) + invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q))) + { + xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex); + ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen); + ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen); + ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen); + ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen); + } + + xof_x4_release(&statex); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) +{ + xof_ctx state; + uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + unsigned int ctr, 
buflen; + + shake128_inc_init(&state); + + xof_absorb(&state, seed, MLKEM_SYMBYTES + 2); + + /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS. + */ + /* This should generate the matrix entry with high probability. */ + xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state); + buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE; + ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen); + + /* Squeeze + sample one more block a time until we're done */ + buflen = XOF_RATE; + while (ctr < MLKEM_N) + __loop__( + assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf)) + invariant(ctr <= MLKEM_N) + invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q))) + { + xof_squeezeblocks(buf, 1, &state); + ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen); + } + + xof_release(&state); +} + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) +#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) +/* End of static namespacing */ + +/************************************************* + * Name: load32_littleendian + * + * Description: load 4 bytes into a 32-bit integer + * in little-endian order + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns 32-bit unsigned integer loaded from x + **************************************************/ +static uint32_t load32_littleendian(const uint8_t x[4]) +{ + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + r |= (uint32_t)x[3] << 24; + return r; +} + +MLKEM_NATIVE_INTERNAL_API +void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3))) + { + unsigned j; + uint32_t t = load32_littleendian(buf + 4 * i); + uint32_t d = t & 0x55555555; + d += (t >> 1) & 0x55555555; + + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3))) + { + const int16_t a = (d >> (4 * j + 0)) & 0x3; + const int16_t b = (d >> (4 * j + 2)) & 0x3; + r->coeffs[8 * i + j] = a - b; + } + } +} + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 +/************************************************* + * Name: load24_littleendian + * + * Description: load 3 bytes into a 32-bit integer + * in little-endian order. 
+ * This function is only needed for ML-KEM-512 + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) + **************************************************/ +static uint32_t load24_littleendian(const uint8_t x[3]) +{ + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + return r; +} + +MLKEM_NATIVE_INTERNAL_API +void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 4; i++) + __loop__( + invariant(i <= MLKEM_N / 4) + invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4))) + { + unsigned j; + const uint32_t t = load24_littleendian(buf + 3 * i); + uint32_t d = t & 0x00249249; + d += (t >> 1) & 0x00249249; + d += (t >> 2) & 0x00249249; + + for (j = 0; j < 4; j++) + __loop__( + invariant(i <= MLKEM_N / 4 && j <= 4) + invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4))) + { + const int16_t a = (d >> (6 * j + 0)) & 0x7; + const int16_t b = (d >> (6 * j + 3)) & 0x7; + r->coeffs[4 * i + j] = a - b; + } + } +} +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \ + 3 */ + +#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ + +#define empty_cu_sampling MLKEM_NAMESPACE_K(empty_cu_sampling) +int empty_cu_sampling; + +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/sampling.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/sampling.h new file mode 100644 index 000000000..cc524e0fc --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/sampling.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef SAMPLING_H +#define SAMPLING_H + +#include +#include +#include "cbmc.h" +#include "common.h" +#include "poly.h" + +#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2) +/************************************************* + * 
Name: poly_cbd2 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter eta=2 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]); + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 +#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3) +/************************************************* + * Name: poly_cbd3 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter eta=3. + * This function is only needed for ML-KEM-512 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]); +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */ + +#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4) +/************************************************* + * Name: poly_rej_uniform_x4 + * + * Description: Generate four polynomials using rejection sampling + * on (pseudo-)uniformly random bytes sampled from a seed. + * + * Arguments: - poly *vec: Pointer to an array of 4 polynomials + * to be sampled. + * - uint8_t *seed[4]: Pointer to array of four pointers + * pointing to the seed buffers of size + * MLKEM_SYMBYTES + 2 each. 
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4]) +__contract__( + requires(memory_no_alias(vec, sizeof(poly) * 4)) + requires(memory_no_alias(seed, sizeof(uint8_t*) * 4)) + requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2)) + requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2)) + requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2)) + requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2)) + assigns(memory_slice(vec, sizeof(poly) * 4)) + ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q))); + +#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform) +/************************************************* + * Name: poly_rej_uniform + * + * Description: Generate polynomial using rejection sampling + * on (pseudo-)uniformly random bytes sampled from a seed. + * + * Arguments: - poly *entry: Pointer to polynomial to be sampled. + * - uint8_t *seed: Pointer to seed buffer of size + * MLKEM_SYMBYTES + 2.
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) +__contract__( + requires(memory_no_alias(entry, sizeof(poly))) + requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2)) + assigns(memory_slice(entry, sizeof(poly))) + ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q))); + +#endif /* SAMPLING_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/zetas.c index 4ef887c62..987f0dce4 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/zetas.c @@ -10,7 +10,7 @@ #include "common.h" #if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) -#include "ntt.h" +#include "poly.h" /* * Table of zeta values used in the reference NTT and inverse NTT. diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/api.h deleted file mode 100644 index 792ecb8a4..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/api.h +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * Native arithmetic interface - * - * This header is primarily for documentation purposes. - * It should not be included by backend implementations. - * - * To ensure consistency with backends, the header will be - * included automatically after inclusion of the active - * backend, to ensure consistency of function signatures, - * and run sanity checks. - */ -#ifdef MLKEM_NATIVE_ARITH_NATIVE_API_H -#error \ - "The arithmetic backend API `mlkem/native/api.h` " \ - "should not be directly included. Please include the relevant " \ - "structure headers directly." 
-#else /* MLKEM_NATIVE_ARITH_NATIVE_API_H */ -#define MLKEM_NATIVE_ARITH_NATIVE_API_H - -#include -#include "poly.h" -#include "polyvec.h" - -/* - * This is the C<->native interface allowing for the drop-in of - * native code for performance critical arithmetic components of ML-KEM. - * - * A _backend_ is a specific implementation of (part of) this interface. - * - * To add a function to a backend, define MLKEM_USE_NATIVE_XXX and - * implement `static inline xxx(...)` in the profile header. - * - * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can - * be set if there are native implementations for all of NTT, invNTT, and - * base multiplication, and allows the native implementation to use a - * custom order of polynomial coefficients in NTT domain -- the use of such - * custom order is not an implementation-detail since the public matrix - * is generated in NTT domain. In this case, a permutation function - * poly_permute_bitrev_to_custom() needs to be provided that permutes - * polynomials in NTT domain from bitreversed to the custom order. - */ - -/* - * Those functions are meant to be trivial wrappers around the chosen native - * implementation. The are static inline to avoid unnecessary calls. - * The macro before each declaration controls whether a native - * implementation is present. - */ - -#if defined(MLKEM_USE_NATIVE_NTT) -/************************************************* - * Name: ntt_native - * - * Description: Computes negacyclic number-theoretic transform (NTT) of - * a polynomial in place. - * - * The input polynomial is assumed to be in normal order. - * The output polynomial is in bitreversed order, or of a - * custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. - * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER - * for more information. 
- * - * Arguments: - poly *p: pointer to in/output polynomial - **************************************************/ -static INLINE void ntt_native(poly *); -#endif /* MLKEM_USE_NATIVE_NTT */ - -#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER) -/* - * This must only be set if NTT, invNTT, basemul, mulcache, and - * to/from byte stream conversions all have native implementations - * that are adapted to the custom order. - */ -#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \ - !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ - !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) || \ - !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) || \ - !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) -#error \ - "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \ -set if there are native implementations for NTT, invNTT, mulcache, basemul, \ -and to/from bytes conversions." -#endif - -/************************************************* - * Name: poly_permute_bitrev_to_custom - * - * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined, - * convert a polynomial in NTT domain from bitreversed - * order to the custom order output by the native NTT. - * - * This must only be defined if there is native code for - * all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache. - * Arguments: - poly *p: pointer to in/output polynomial - * - **************************************************/ -static INLINE void poly_permute_bitrev_to_custom(poly *); -#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ - -#if defined(MLKEM_USE_NATIVE_INTT) -/************************************************* - * Name: intt_native - * - * Description: Computes inverse of negacyclic number-theoretic transform (NTT) - * of a polynomial in place. - * - * The input polynomial is in bitreversed order, or of a - * custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. - * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER - * for more information. 
- * The output polynomial is assumed to be in normal order. - * - * Arguments: - uint16_t *a: pointer to in/output polynomial - **************************************************/ -static INLINE void intt_native(poly *); -#endif /* MLKEM_USE_NATIVE_INTT */ - -#if defined(MLKEM_USE_NATIVE_POLY_REDUCE) -/************************************************* - * Name: poly_reduce_native - * - * Description: Applies modular reduction to all coefficients of a polynomial. - * - * Arguments: - poly *r: pointer to input/output polynomial - **************************************************/ -static INLINE void poly_reduce_native(poly *); -#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ - -#if defined(MLKEM_USE_NATIVE_POLY_TOMONT) -/************************************************* - * Name: poly_tomont_native - * - * Description: Inplace conversion of all coefficients of a polynomial - * from normal domain to Montgomery domain - * - * Arguments: - poly *r: pointer to input/output polynomial - **************************************************/ -static INLINE void poly_tomont_native(poly *); -#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ - -#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) -/************************************************* - * Name: poly_mulcache_compute_native - * - * Description: Compute multiplication cache for a polynomial - * in NTT domain. - * - * The purpose of the multiplication cache is to - * cache repeated computations required during a - * base multiplication of polynomials in NTT domain. - * The structure of the multiplication-cache is - * implementation defined. - * - * Arguments: INPUT: - * - poly: const pointer to input polynomial. - * This must be in NTT domain and inin bitreversed order, or of - * a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. - * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER - * for more information. 
- * OUTPUT - * - cache: pointer to multiplication cache - **************************************************/ -static INLINE void poly_mulcache_compute_native(poly_mulcache *cache, - const poly *poly); -#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ - -#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) -/************************************************* - * Name: poly_mulcache_compute_native - * - * Description: Compute multiplication of polynomials in NTT domain. - * - * Arguments: INPUT: - * - a: First polynomial operand. - * This must be in NTT domain and inin bitreversed order, or of - * a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. - * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER - * for more information. - * - b: Second polynomial operand. - * As for a. - * - b_cache: Multiplication-cache for b. - * OUTPUT - * - r: Result of the base multiplication. This is again - * in NTT domain, and of the same order as a and b. - **************************************************/ -static INLINE void polyvec_basemul_acc_montgomery_cached_native( - poly *r, const polyvec *a, const polyvec *b, - const polyvec_mulcache *b_cache); -#endif - -#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES) -/************************************************* - * Name: poly_tobytes_native - * - * Description: Serialization of a polynomial. - * Signed coefficients are converted to - * unsigned form before serialization. - * - * Arguments: INPUT: - * - a: const pointer to input polynomial, - * with each coefficient in the range -Q+1 .. 
Q-1 - * OUTPUT - * - r: pointer to output byte array - * (of MLKEM_POLYBYTES bytes) - **************************************************/ -static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], - const poly *a); -#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ - -#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) -/************************************************* - * Name: poly_frombytes_native - * - * Description: Serialization of a polynomial. - * Signed coefficients are converted to - * unsigned form before serialization. - * - * Arguments: INPUT: - * - r: pointer to output polynomial in NTT domain - * OUTPUT - * - a: const pointer to input byte aray - * (of MLKEM_POLYBYTES bytes) - **************************************************/ -static INLINE void poly_frombytes_native(poly *a, - const uint8_t r[MLKEM_POLYBYTES]); -#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ - -#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM) -/************************************************* - * Name: rej_uniform_native - * - * Description: Run rejection sampling on uniform random bytes to generate - * uniform random integers mod q - * - * Arguments: - int16_t *r: pointer to output buffer - * - unsigned int len: requested number of 16-bit integers - * (uniform mod q). - * - const uint8_t *buf: pointer to input buffer - * (assumed to be uniform random bytes) - * - unsigned int buflen: length of input buffer in bytes. - * - * Return -1 if the native implementation does not support the input lengths. - * Otherwise, returns non-negative number of sampled 16-bit integers (at most - * len). 
- **************************************************/ -static INLINE int rej_uniform_native(int16_t *r, unsigned int len, - const uint8_t *buf, unsigned int buflen); -#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */ - -#endif /* MLKEM_NATIVE_ARITH_NATIVE_API_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/arith_backend.h index 0543b1bd1..ade31cda1 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/arith_backend.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/arith_backend.h @@ -17,7 +17,7 @@ * Keep this _after_ the inclusion of the backend; otherwise, * the sanity checks won't have an effect. */ #if defined(MLKEM_NATIVE_CHECK_APIS) -#include "api.h" +#include "native/api.h" #endif #endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.c deleted file mode 100644 index 1e6b7c5d1..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED - -#include -#include "cbd.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. 
*/ -#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) -#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) -/* End of static namespacing */ - -/************************************************* - * Name: load32_littleendian - * - * Description: load 4 bytes into a 32-bit integer - * in little-endian order - * - * Arguments: - const uint8_t *x: pointer to input byte array - * - * Returns 32-bit unsigned integer loaded from x - **************************************************/ -static uint32_t load32_littleendian(const uint8_t x[4]) -{ - uint32_t r; - r = (uint32_t)x[0]; - r |= (uint32_t)x[1] << 8; - r |= (uint32_t)x[2] << 16; - r |= (uint32_t)x[3] << 24; - return r; -} - -MLKEM_NATIVE_INTERNAL_API -void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3))) - { - unsigned j; - uint32_t t = load32_littleendian(buf + 4 * i); - uint32_t d = t & 0x55555555; - d += (t >> 1) & 0x55555555; - - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3))) - { - const int16_t a = (d >> (4 * j + 0)) & 0x3; - const int16_t b = (d >> (4 * j + 2)) & 0x3; - r->coeffs[8 * i + j] = a - b; - } - } -} - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 -/************************************************* - * Name: load24_littleendian - * - * Description: load 3 bytes into a 32-bit integer - * in little-endian order. 
- * This function is only needed for ML-KEM-512 - * - * Arguments: - const uint8_t *x: pointer to input byte array - * - * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) - **************************************************/ -static uint32_t load24_littleendian(const uint8_t x[3]) -{ - uint32_t r; - r = (uint32_t)x[0]; - r |= (uint32_t)x[1] << 8; - r |= (uint32_t)x[2] << 16; - return r; -} - -MLKEM_NATIVE_INTERNAL_API -void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 4; i++) - __loop__( - invariant(i <= MLKEM_N / 4) - invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4))) - { - unsigned j; - const uint32_t t = load24_littleendian(buf + 3 * i); - uint32_t d = t & 0x00249249; - d += (t >> 1) & 0x00249249; - d += (t >> 2) & 0x00249249; - - for (j = 0; j < 4; j++) - __loop__( - invariant(i <= MLKEM_N / 4 && j <= 4) - invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4))) - { - const int16_t a = (d >> (6 * j + 0)) & 0x7; - const int16_t b = (d >> (6 * j + 3)) & 0x7; - r->coeffs[4 * i + j] = a - b; - } - } -} -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \ - 3 */ - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd) -int empty_cu_cbd; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.h deleted file mode 100644 index 54c1f5b90..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef CBD_H -#define CBD_H - -#include -#include "common.h" -#include "poly.h" - -#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2) -/************************************************* - * Name: poly_cbd2 - * - * Description: Given an array of uniformly random bytes, 
compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter eta=2 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]); - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 -#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3) -/************************************************* - * Name: poly_cbd3 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter eta=3. - * This function is only needed for ML-KEM-512 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]); -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */ - -#endif /* CBD_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/common.h index 4f326333e..62ed53ab1 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/common.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/common.h @@ -15,12 +15,19 @@ #include "sys.h" /* Include backend metadata */ -#if defined(MLKEM_USE_NATIVE) -#if defined(MLKEM_NATIVE_ARITH_BACKEND) -#include MLKEM_NATIVE_ARITH_BACKEND +#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) +#if defined(MLKEM_NATIVE_ARITH_BACKEND_FILE) +#include MLKEM_NATIVE_ARITH_BACKEND_FILE +#else +#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_ARITH is set, but MLKEM_NATIVE_ARITH_BACKEND_FILE is not. 
+#endif #endif -#if defined(MLKEM_NATIVE_FIPS202_BACKEND) -#include MLKEM_NATIVE_FIPS202_BACKEND + +#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) +#if defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE) +#include MLKEM_NATIVE_FIPS202_BACKEND_FILE +#else +#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, but MLKEM_NATIVE_FIPS202_BACKEND_FILE is not. #endif #endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/compress.c new file mode 100644 index 000000000..a03fe0ac4 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/compress.c @@ -0,0 +1,395 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include "common.h" +#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) + +#include +#include +#include "arith_backend.h" +#include "cbmc.h" +#include "compress.h" +#include "debug.h" +#include "verify.h" + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3) +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + uint8_t t[8] = {0}; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_bound(t, 0, j, 0, 16))) + { + t[j] = scalar_compress_d4(a->coeffs[8 * i + j]); + } + + r[i * 4] = t[0] | (t[1] << 4); + r[i * 4 + 1] = t[2] | (t[3] << 4); + r[i * 4 + 2] = t[4] | (t[5] << 4); + r[i * 4 + 3] = t[6] | (t[7] << 4); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a) +{ + unsigned j; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + for (j = 0; j < MLKEM_N / 4; j++) + __loop__(invariant(j <= MLKEM_N / 4)) + { + unsigned k; + uint16_t t[4]; + for (k = 0; k < 4; k++) + __loop__( + invariant(k <= 
4) + invariant(forall(r, 0, k, t[r] < (1u << 10)))) + { + t[k] = scalar_compress_d10(a->coeffs[4 * j + k]); + } + + /* + * Make all implicit truncation explicit. No data is being + * truncated for the LHS's since each t[i] is 10-bit in size. + */ + r[5 * j + 0] = (t[0] >> 0) & 0xFF; + r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF); + r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF); + r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF); + r[5 * j + 4] = (t[3] >> 2); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 2; i++) + __loop__( + invariant(i <= MLKEM_N / 2) + invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q))) + { + r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF); + r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF); + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d10(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]) +{ + unsigned j; + for (j = 0; j < MLKEM_N / 4; j++) + __loop__( + invariant(j <= MLKEM_N / 4) + invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q))) + { + unsigned k; + uint16_t t[4]; + uint8_t const *base = &a[5 * j]; + + t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); + t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6)); + t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4)); + t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2)); + + for (k = 0; k < 4; k++) + __loop__( + invariant(k <= 4) + invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q))) + { + r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ + || MLKEM_K == 3) */ + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d5(uint8_t 
r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + uint8_t t[8] = {0}; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_bound(t, 0, j, 0, 32))) + { + t[j] = scalar_compress_d5(a->coeffs[8 * i + j]); + } + + /* + * Explicitly truncate to avoid warning about + * implicit truncation in CBMC, and use array indexing into + * r rather than pointer-arithmetic to simplify verification + */ + r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5)); + r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); + r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4)); + r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); + r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3)); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a) +{ + unsigned j; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (j = 0; j < MLKEM_N / 8; j++) + __loop__(invariant(j <= MLKEM_N / 8)) + { + unsigned k; + uint16_t t[8]; + for (k = 0; k < 8; k++) + __loop__( + invariant(k <= 8) + invariant(forall(r, 0, k, t[r] < (1u << 11)))) + { + t[k] = scalar_compress_d11(a->coeffs[8 * j + k]); + } + + /* + * Make all implicit truncation explicit. No data is being + * truncated for the LHS's since each t[i] is 11-bit in size. 
+ */ + r[11 * j + 0] = (t[0] >> 0) & 0xFF; + r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF); + r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF); + r[11 * j + 3] = (t[2] >> 2) & 0xFF; + r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF); + r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF); + r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF); + r[11 * j + 7] = (t[5] >> 1) & 0xFF; + r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF); + r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF); + r[11 * j + 10] = (t[7] >> 3); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) + { + unsigned j; + uint8_t t[8]; + const unsigned offset = i * 5; + /* + * Explicitly truncate to avoid warning about + * implicit truncation in CBMC and unwind loop for ease + * of proof. + */ + + /* + * Decompress 5 8-bit bytes (so 40 bits) into + * 8 5-bit values stored in t[] + */ + t[0] = 0x1F & (a[offset + 0] >> 0); + t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3)); + t[2] = 0x1F & (a[offset + 1] >> 2); + t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1)); + t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4)); + t[5] = 0x1F & (a[offset + 3] >> 1); + t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2)); + t[7] = 0x1F & (a[offset + 4] >> 3); + + /* and copy to the correct slice in r[] */ + for (j = 0; j < 8; j++) + __loop__( + invariant(j <= 8 && i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) + { + r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d11(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]) +{ + unsigned j; + for (j = 0; j < MLKEM_N / 8; j++) + __loop__( + invariant(j <= MLKEM_N 
/ 8) + invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q))) + { + unsigned k; + uint16_t t[8]; + uint8_t const *base = &a[11 * j]; + t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); + t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5)); + t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) | + ((uint16_t)base[4] << 10)); + t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7)); + t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4)); + t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) | + ((uint16_t)base[8] << 9)); + t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6)); + t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3)); + + for (k = 0; k < 8; k++) + __loop__( + invariant(k <= 8) + invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q))) + { + r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */ + +#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 2; i++) + __loop__(invariant(i <= MLKEM_N / 2)) + { + const uint16_t t0 = a->coeffs[2 * i]; + const uint16_t t1 = a->coeffs[2 * i + 1]; + /* + * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of + * significant data, so these can be packed into 24 bits or exactly + * 3 bytes, as follows. + */ + + /* Least significant bits 0 - 7 of t0. */ + r[3 * i + 0] = t0 & 0xFF; + + /* + * Most significant bits 8 - 11 of t0 become the least significant + * nibble of the second byte. The least significant 4 bits + * of t1 become the upper nibble of the second byte. + */ + r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0); + + /* Bits 4 - 11 of t1 become the third byte. 
*/ + r[3 * i + 2] = t1 >> 4; + } +} +#else /* MLKEM_USE_NATIVE_POLY_TOBYTES */ +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +{ + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + poly_tobytes_native(r, a->coeffs); +} +#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + +#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 2; i++) + __loop__( + invariant(i <= MLKEM_N / 2) + invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT))) + { + const uint8_t t0 = a[3 * i + 0]; + const uint8_t t1 = a[3 * i + 1]; + const uint8_t t2 = a[3 * i + 2]; + r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF); + r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4); + } + + /* Note that the coefficients are not canonical */ + debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT); +} +#else /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +{ + poly_frombytes_native(r->coeffs, a); +} +#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ + +MLKEM_NATIVE_INTERNAL_API +void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +{ + unsigned i; +#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8) +#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!" 
+#endif + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) + { + unsigned j; + for (j = 0; j < 8; j++) + __loop__( + invariant(i < MLKEM_N / 8 && j <= 8) + invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) + { + /* Prevent the compiler from recognizing this as a bit selection */ + uint8_t mask = value_barrier_u8(1u << j); + r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask); + } + } + debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + msg[i] = 0; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8)) + { + uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]); + msg[i] |= t << j; + } + } +} + +#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ + +#define empty_cu_compress MLKEM_NAMESPACE_K(empty_cu_compress) +int empty_cu_compress; + +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/compress.h new file mode 100644 index 000000000..409dbe519 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/compress.h @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef COMPRESS_H +#define COMPRESS_H + +#include +#include +#include "cbmc.h" +#include "common.h" +#include "debug.h" +#include "poly.h" +#include "verify.h" + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1) +#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4) +#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5) +#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10) +#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11) +#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4) +#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5) +#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10) +#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11) +/* End of static namespacing */ + +/************************************************************ + * Name: scalar_compress_d1 + * + * Description: Computes round(u * 2 / q) + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 1. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. + */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d1(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 2) + ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) ) +{ + uint32_t d0 = u << 1; + d0 *= 645083; + d0 += 1u << 30; + d0 >>= 31; + return d0; +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_compress_d4 + * + * Description: Computes round(u * 16 / q) % 16 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 4. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. 
+ ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. + */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d4(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 16) + ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16)) +{ + uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */ + return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */ +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d4 + * + * Description: Computes round(u * q / 16) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 4. + * + * Arguments: - u: Unsigned canonical modulus modulo 16 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d4(uint32_t u) +__contract__( + requires(0 <= u && u < 16) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 8) / 16; } + +/************************************************************ + * Name: scalar_compress_d5 + * + * Description: Computes round(u * 32 / q) % 32 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 5. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. 
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d5(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 32) + ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) ) +{ + uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */ + return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */ +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d5 + * + * Description: Computes round(u * q / 32) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 5. + * + * Arguments: - u: Unsigned canonical modulus modulo 32 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d5(uint32_t u) +__contract__( + requires(0 <= u && u < 32) + ensures(return_value <= MLKEM_Q - 1) +) { return ((u * MLKEM_Q) + 16) / 32; } + +/************************************************************ + * Name: scalar_compress_d10 + * + * Description: Computes round(u * 2**10 / q) % 2**10 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 10. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. 
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d10(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < (1u << 10)) + ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10))) +{ + uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^33 / MLKEM_Q) */ + d0 = (d0 + ((uint64_t)1u << 32)) >> 33; + return (d0 & 0x3FF); +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d10 + * + * Description: Computes round(u * q / 1024) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 10. + * + * Arguments: - u: Unsigned canonical modulus modulo 1024 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d10(uint32_t u) +__contract__( + requires(0 <= u && u < 1024) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 512) / 1024; } + +/************************************************************ + * Name: scalar_compress_d11 + * + * Description: Computes round(u * 2**11 / q) % 2**11 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 11. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required.
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d11(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < (1u << 11)) + ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11))) +{ + uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */ + d0 = (d0 + ((uint64_t)1u << 32)) >> 33; + return (d0 & 0x7FF); +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d11 + * + * Description: Computes round(u * q / 2048) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 11. + * + * Arguments: - u: Unsigned canonical modulus modulo 2048 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d11(uint32_t u) +__contract__( + requires(0 <= u && u < 2048) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 1024) / 2048; } + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \ + (MLKEM_K == 2 || MLKEM_K == 3) +#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4) +/************************************************* + * Name: poly_compress_d4 + * + * Description: Compression (4 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a); + +#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10) +/************************************************* + * Name: poly_compress_d10 + * + * Description: Compression (10 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a); + +#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4) +/************************************************* + * Name: poly_decompress_d4 + * + * Description: De-serialization and subsequent decompression (dv bits) of a + * polynomial; approximate inverse of poly_compress + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). 
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]); + +#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10) +/************************************************* + * Name: poly_decompress_d10 + * + * Description: De-serialization and subsequent decompression (10 bits) of a + * polynomial; approximate inverse of poly_compress_d10 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d10(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]); +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ + || MLKEM_K == 3) */ + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 +#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5) +/************************************************* + * Name: poly_compress_d5 + * + * Description: Compression (5 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a); + +#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11) +/************************************************* + * Name: poly_compress_d11 + * + * Description: Compression (11 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a); + +#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5) +/************************************************* + * Name: poly_decompress_d5 + * + * Description: De-serialization and subsequent decompression (dv bits) of a + * polynomial; approximate inverse of poly_compress + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). 
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]); + +#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11) +/************************************************* + * Name: poly_decompress_d11 + * + * Description: De-serialization and subsequent decompression (11 bits) of a + * polynomial; approximate inverse of poly_compress_d11 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d11(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]); +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \ + */ + +#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes) +/************************************************* + * Name: poly_tobytes + * + * Description: Serialization of a polynomial. + * Signed coefficients are converted to + * unsigned form before serialization. 
+ * + * Arguments: INPUT: + * - a: const pointer to input polynomial, + * with each coefficient in the range [0,1,..,Q-1] + * OUTPUT + * - r: pointer to output byte array + * (of MLKEM_POLYBYTES bytes) + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYBYTES)) + requires(memory_no_alias(a, sizeof(poly))) + requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(r)) +); + + +#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes) +/************************************************* + * Name: poly_frombytes + * + * Description: De-serialization of a polynomial. + * + * Arguments: INPUT + * - a: pointer to input byte array + * (of MLKEM_POLYBYTES bytes) + * OUTPUT + * - r: pointer to output polynomial, with + * each coefficient unsigned and in the range + * 0 .. 4095 + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)) +); + + +#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg) +/************************************************* + * Name: poly_frommsg + * + * Description: Convert 32-byte message to polynomial + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *msg: pointer to input message + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +__contract__( + requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(object_whole(r)) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) +); + +#define poly_tomsg 
MLKEM_NAMESPACE(poly_tomsg) +/************************************************* + * Name: poly_tomsg + * + * Description: Convert polynomial to 32-byte message + * + * Arguments: - uint8_t *msg: pointer to output message + * - const poly *r: pointer to input polynomial + * Coefficients must be unsigned canonical + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r) +__contract__( + requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(msg)) +); + +#endif /* COMPRESS_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/config.h index fa89370ce..e975ede95 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/config.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/config.h @@ -122,46 +122,87 @@ /* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ /****************************************************************************** - * Name: MLKEM_USE_NATIVE + * Name: MLKEM_USE_NATIVE_BACKEND_ARITH * - * Description: Determines whether a native backend should - * be used, if available. + * Description: Determines whether a native arithmetic backend should be used. + * + * The arithmetic backend covers performance critical functions + * such as the number-theoretic transform (NTT). + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the arithmetic backend to be used is + * determined by MLKEM_NATIVE_ARITH_BACKEND_FILE: If the latter is + * unset, the default backend for the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. * * This can also be set using CFLAGS.
* *****************************************************************************/ -#if !defined(MLKEM_USE_NATIVE) -/* #define MLKEM_USE_NATIVE */ +#if !defined(MLKEM_USE_NATIVE_BACKEND_ARITH) +/* #define MLKEM_USE_NATIVE_BACKEND_ARITH */ #endif /****************************************************************************** - * Name: MLKEM_NATIVE_ARITH_BACKEND + * Name: MLKEM_NATIVE_ARITH_BACKEND_FILE * * Description: The arithmetic backend to use. * - * This must be the filename of an arithmetic backend. - * See the existing backends for examples. + * If MLKEM_USE_NATIVE_BACKEND_ARITH is unset, this option + * is ignored. + * + * If MLKEM_USE_NATIVE_BACKEND_ARITH is set, this option must + * either be undefined or the filename of an arithmetic backend. + * If unset, the default backend will be used. * * This can be set using CFLAGS. * *****************************************************************************/ -#if defined(MLKEM_USE_NATIVE) && !defined(MLKEM_NATIVE_ARITH_BACKEND) -#define MLKEM_NATIVE_ARITH_BACKEND "default.h" -#endif /* MLKEM_NATIVE_ARITH_BACKEND */ +#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) && \ + !defined(MLKEM_NATIVE_ARITH_BACKEND_FILE) +#define MLKEM_NATIVE_ARITH_BACKEND_FILE "native/default.h" +#endif /****************************************************************************** - * Name: MLKEM_NATIVE_FIPS202_BACKEND + * Name: MLKEM_USE_NATIVE_BACKEND_FIPS202 + * + * Description: Determines whether a native FIPS202 backend should be used. + * + * The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is + * the performance bottleneck of SHA3 and SHAKE. + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the FIPS202 backend to be used is + * determined by MLKEM_NATIVE_FIPS202_BACKEND_FILE: If the latter is + * unset, the default backend for the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. + * + * This can also be set using CFLAGS.
+ * + *****************************************************************************/ +#if !defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) +/* #define MLKEM_USE_NATIVE_BACKEND_FIPS202 */ +#endif + +/****************************************************************************** + * Name: MLKEM_NATIVE_FIPS202_BACKEND_FILE * * Description: The FIPS-202 backend to use. * - * This must be the filename of an FIPS-202 backend. + * If MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, this option must + * either be undefined or the filename of a FIPS202 backend. + * If unset, the default backend will be used. * * This can be set using CFLAGS. * *****************************************************************************/ -#if defined(MLKEM_USE_NATIVE_FIPS202) && !defined(MLKEM_NATIVE_FIPS202_BACKEND) -#define MLKEM_NATIVE_FIPS202_BACKEND "native/default.h" -#endif /* MLKEM_NATIVE_FIPS202_BACKEND */ +#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) && \ + !defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE) +#define MLKEM_NATIVE_FIPS202_BACKEND_FILE "fips202/native/default.h" +#endif /************************* Config internals ********************************/ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/default.h deleted file mode 100644 index d1e41c52e..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/default.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H -#define MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H - -/* - * Default arithmetic backend - */ -#include "sys.h" - -#ifdef SYS_AARCH64 -/* - * For AArch64, we currently we have one clean and one opt profile. - * We default to the opt profile. - * - * In the future, this may branch further depending on the microarchitecture. 
- */ -#include "aarch64/opt.h" -#endif /* SYS_AARCH64 */ - -#ifdef SYS_X86_64_AVX2 -/* - * For now, there's only one x86_64 profile, based on - * the AVX2 code from the Kyber repository. - * https://github.com/pq-crystals/kyber - */ -#include "x86_64/default.h" -#endif /* SYS_X86_64 */ - -#endif /* MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.c index 0cfcc3e9e..318d0fc77 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.c @@ -9,11 +9,10 @@ #include "fips202.h" #include "fips202x4.h" #include "indcpa.h" -#include "ntt.h" #include "poly.h" -#include "polyvec.h" +#include "poly_k.h" #include "randombytes.h" -#include "rej_uniform.h" +#include "sampling.h" #include "symmetric.h" #include "arith_backend.h" @@ -149,14 +148,14 @@ static void unpack_ciphertext(polyvec *b, poly *v, #define poly_permute_bitrev_to_custom \ MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom) -static INLINE void poly_permute_bitrev_to_custom(poly *data) +static INLINE void poly_permute_bitrev_to_custom(int16_t data[MLKEM_N]) __contract__( /* We don't specify that this should be a permutation, but only * that it does not change the bound established at the end of gen_matrix. 
*/ - requires(memory_no_alias(data, sizeof(poly))) - requires(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N)) + requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q)) assigns(memory_slice(data, sizeof(poly))) - ensures(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); } + ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); } #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ /* Not static for benchmarking */ @@ -245,7 +244,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed) { for (j = 0; j < MLKEM_K; j++) { - poly_permute_bitrev_to_custom(&a[i].vec[j]); + poly_permute_bitrev_to_custom(a[i].vec[j].coeffs); } } } diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.h index 2c4fda3c4..b4d5985bf 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.h @@ -8,7 +8,7 @@ #include #include "cbmc.h" #include "common.h" -#include "polyvec.h" +#include "poly_k.h" #define gen_matrix MLKEM_NAMESPACE_K(gen_matrix) /************************************************* diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/native/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/native/api.h new file mode 100644 index 000000000..0704f9dcd --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/native/api.h @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * Native arithmetic interface + * + * This header is primarily for documentation purposes. + * It should not be included by backend implementations. + * + * To ensure consistency with backends, the header will be + * included automatically after inclusion of the active + * backend, to ensure consistency of function signatures, + * and run sanity checks. 
+ */ +#ifdef MLKEM_NATIVE_ARITH_NATIVE_API_H +#error \ + "The arithmetic backend API `mlkem/native/api.h` " \ + "should not be directly included. Please include the relevant " \ + "structure headers directly." +#else /* MLKEM_NATIVE_ARITH_NATIVE_API_H */ +#define MLKEM_NATIVE_ARITH_NATIVE_API_H + +#include +#include "../common.h" + +/* + * This is the C<->native interface allowing for the drop-in of + * native code for performance critical arithmetic components of ML-KEM. + * + * A _backend_ is a specific implementation of (part of) this interface. + * + * To add a function to a backend, define MLKEM_USE_NATIVE_XXX and + * implement `static inline xxx(...)` in the profile header. + * + * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can + * be set if there are native implementations for all of NTT, invNTT, and + * base multiplication, and allows the native implementation to use a + * custom order of polynomial coefficients in NTT domain -- the use of such + * custom order is not an implementation-detail since the public matrix + * is generated in NTT domain. In this case, a permutation function + * poly_permute_bitrev_to_custom() needs to be provided that permutes + * polynomials in NTT domain from bitreversed to the custom order. + */ + +/* + * Those functions are meant to be trivial wrappers around the chosen native + * implementation. They are static inline to avoid unnecessary calls. + * The macro before each declaration controls whether a native + * implementation is present. + */ + +#if defined(MLKEM_USE_NATIVE_NTT) +/************************************************* + * Name: ntt_native + * + * Description: Computes negacyclic number-theoretic transform (NTT) of + * a polynomial in place. + * + * The input polynomial is assumed to be in normal order. + * The output polynomial is in bitreversed order, or of a + * custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER + * for more information. + * + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial + **************************************************/ +static INLINE void ntt_native(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_NTT */ + +#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER) +/* + * This must only be set if NTT, invNTT, basemul, mulcache, and + * to/from byte stream conversions all have native implementations + * that are adapted to the custom order. + */ +#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) || \ + !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) || \ + !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) +#error \ + "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \ +set if there are native implementations for NTT, invNTT, mulcache, basemul, \ +and to/from bytes conversions." +#endif + +/************************************************* + * Name: poly_permute_bitrev_to_custom + * + * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined, + * convert a polynomial in NTT domain from bitreversed + * order to the custom order output by the native NTT. + * + * This must only be defined if there is native code for + * all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache. + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial + * + **************************************************/ +static INLINE void poly_permute_bitrev_to_custom(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ + +#if defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: intt_native + * + * Description: Computes inverse of negacyclic number-theoretic transform (NTT) + * of a polynomial in place. 
+ * + * The input polynomial is in bitreversed order, or of a + * custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. + * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER + * for more information. + * The output polynomial is assumed to be in normal order. + * + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial + **************************************************/ +static INLINE void intt_native(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_INTT */ + +#if defined(MLKEM_USE_NATIVE_POLY_REDUCE) +/************************************************* + * Name: poly_reduce_native + * + * Description: Applies modular reduction to all coefficients of a polynomial. + * + * Arguments: - int16_t p[MLKEM_N]: pointer to input/output polynomial + **************************************************/ +static INLINE void poly_reduce_native(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ + +#if defined(MLKEM_USE_NATIVE_POLY_TOMONT) +/************************************************* + * Name: poly_tomont_native + * + * Description: Inplace conversion of all coefficients of a polynomial + * from normal domain to Montgomery domain + * + * Arguments: - int16_t p[MLKEM_N]: pointer to input/output polynomial + **************************************************/ +static INLINE void poly_tomont_native(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ + +#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) +/************************************************* + * Name: poly_mulcache_compute_native + * + * Description: Compute multiplication cache for a polynomial + * in NTT domain. + * + * The purpose of the multiplication cache is to + * cache repeated computations required during a + * base multiplication of polynomials in NTT domain. + * The structure of the multiplication-cache is + * implementation defined. + * + * Arguments: INPUT: + * - poly: const pointer to input polynomial.
+ * This must be in NTT domain and in bitreversed order, or of + * a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. + * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER + * for more information. + * OUTPUT + * - cache: pointer to multiplication cache + **************************************************/ +static INLINE void poly_mulcache_compute_native(int16_t cache[MLKEM_N / 2], + const int16_t poly[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ + +#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) +/************************************************* + * Name: polyvec_basemul_acc_montgomery_cached_native + * + * Description: Compute multiplication of polynomials in NTT domain. + * + * Arguments: INPUT: + * - a: First polynomial operand. + * This must be in NTT domain and in bitreversed order, or of + * a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. + * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER + * for more information. + * - b: Second polynomial operand. + * As for a. + * - b_cache: Multiplication-cache for b. + * OUTPUT + * - r: Result of the base multiplication. This is again + * in NTT domain, and of the same order as a and b. + **************************************************/ +static INLINE void polyvec_basemul_acc_montgomery_cached_native( + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]); +#endif + +#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES) +/************************************************* + * Name: poly_tobytes_native + * + * Description: Serialization of a polynomial. + * Signed coefficients are converted to + * unsigned form before serialization. + * + * Arguments: INPUT: + * - a: const pointer to input polynomial, + * with each coefficient in the range -Q+1 ..
Q-1 + * OUTPUT + * - r: pointer to output byte array + * (of MLKEM_POLYBYTES bytes) + **************************************************/ +static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], + const int16_t a[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + +#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) +/************************************************* + * Name: poly_frombytes_native + * + * Description: De-serialization of a polynomial: + * the input byte array r is unpacked into + * the coefficient array a. + * + * Arguments: INPUT: + * - r: const pointer to input byte array + * (of MLKEM_POLYBYTES bytes) + * OUTPUT + * - a: pointer to output polynomial in NTT domain + **************************************************/ +static INLINE void poly_frombytes_native(int16_t a[MLKEM_N], + const uint8_t r[MLKEM_POLYBYTES]); +#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ + +#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM) +/************************************************* + * Name: rej_uniform_native + * + * Description: Run rejection sampling on uniform random bytes to generate + * uniform random integers mod q + * + * Arguments: - int16_t *r: pointer to output buffer + * - unsigned int len: requested number of 16-bit integers + * (uniform mod q). + * - const uint8_t *buf: pointer to input buffer + * (assumed to be uniform random bytes) + * - unsigned int buflen: length of input buffer in bytes. + * + * Return -1 if the native implementation does not support the input lengths. + * Otherwise, returns non-negative number of sampled 16-bit integers (at most + * len).
+ **************************************************/ +static INLINE int rej_uniform_native(int16_t *r, unsigned int len, + const uint8_t *buf, unsigned int buflen); +#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */ + +#endif /* MLKEM_NATIVE_ARITH_NATIVE_API_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/native/default.h new file mode 100644 index 000000000..f9fe4310a --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/native/default.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H +#define MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H + +/* + * Default arithmetic backend + */ +#include "../sys.h" + +#ifdef SYS_AARCH64 +/* + * For AArch64, we currently we have one clean and one opt profile. + * We default to the opt profile. + * + * In the future, this may branch further depending on the microarchitecture. + */ +#include "aarch64/opt.h" +#endif /* SYS_AARCH64 */ + +#ifdef SYS_X86_64_AVX2 +/* + * For now, there's only one x86_64 profile, based on + * the AVX2 code from the Kyber repository. + * https://github.com/pq-crystals/kyber + */ +#include "x86_64/default.h" +#endif /* SYS_X86_64 */ + +#endif /* MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.c deleted file mode 100644 index 3651c8da9..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.c +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) - -#include -#include "arith_backend.h" -#include "debug.h" -#include "ntt.h" -#include "reduce.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. 
with varying security levels) - * within a single compilation unit. */ -#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block) -#define ntt_layer MLKEM_NAMESPACE(ntt_layer) -#define invntt_layer MLKEM_NAMESPACE(invntt_layer) -/* End of static namespacing */ - -#if !defined(MLKEM_USE_NATIVE_NTT) -/* - * Computes a block CT butterflies with a fixed twiddle factor, - * using Montgomery multiplication. - * Parameters: - * - r: Pointer to base of polynomial (_not_ the base of butterfly block) - * - root: Twiddle factor to use for the butterfly. This must be in - * Montgomery form and signed canonical. - * - start: Offset to the beginning of the butterfly block - * - len: Index difference between coefficients subject to a butterfly - * - bound: Ghost variable describing coefficient bound: Prior to `start`, - * coefficients must be bound by `bound + MLKEM_Q`. Post `start`, - * they must be bound by `bound`. - * When this function returns, output coefficients in the index range - * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`. 
- * Example: - * - start=8, len=4 - * This would compute the following four butterflies - * 8 -- 12 - * 9 -- 13 - * 10 -- 14 - * 11 -- 15 - * - start=4, len=2 - * This would compute the following two butterflies - * 4 -- 6 - * 5 -- 7 - */ -static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, - unsigned start, unsigned len, int bound) -__contract__( - requires(start < MLKEM_N) - requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N) - requires(0 <= bound && bound < INT16_MAX - MLKEM_Q) - requires(-HALF_Q < zeta && zeta < HALF_Q) - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(array_abs_bound(r, 0, start, bound + MLKEM_Q)) - requires(array_abs_bound(r, start, MLKEM_N, bound)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q)) - ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound))) -{ - /* `bound` is a ghost variable only needed in the CBMC specification */ - unsigned j; - ((void)bound); - for (j = start; j < start + len; j++) - __loop__( - invariant(start <= j && j <= start + len) - /* - * Coefficients are updated in strided pairs, so the bounds for the - * intermediate states alternate twice between the old and new bound - */ - invariant(array_abs_bound(r, 0, j, bound + MLKEM_Q)) - invariant(array_abs_bound(r, j, start + len, bound)) - invariant(array_abs_bound(r, start + len, j + len, bound + MLKEM_Q)) - invariant(array_abs_bound(r, j + len, MLKEM_N, bound))) - { - int16_t t; - t = fqmul(r[j + len], zeta); - r[j + len] = r[j] - t; - r[j] = r[j] + t; - } -} - -/* - *Compute one layer of forward NTT - * Parameters: - * - r: Pointer to base of polynomial - * - len: Stride of butterflies in this layer. - * - layer: Ghost variable indicating which layer is being applied. - * Must match `len` via `len == MLKEM_N >> layer`. 
- * Note: `len` could be dropped and computed in the function, but - * we are following the structure of the reference NTT from the - * official Kyber implementation here, merely adding `layer` as - * a ghost variable for the specifications. - */ -static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer) -__contract__( - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer)) - requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q))) -{ - unsigned start, k; - /* `layer` is a ghost variable only needed in the CBMC specification */ - ((void)layer); - /* Twiddle factors for layer n start at index 2^(layer-1) */ - k = MLKEM_N / (2 * len); - for (start = 0; start < MLKEM_N; start += 2 * len) - __loop__( - invariant(start < MLKEM_N + 2 * len) - invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N) - invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q)) - invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q))) - { - int16_t zeta = zetas[k++]; - ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q); - } -} - -/* - * Compute full forward NTT - * NOTE: This particular implementation satisfies a much tighter - * bound on the output coefficients (5*q) than the contractual one (8*q), - * but this is not needed in the calling code. Should we change the - * base multiplication strategy to require smaller NTT output bounds, - * the proof may need strengthening. 
- */ - -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *p) -{ - unsigned len, layer; - int16_t *r; - debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); - r = p->coeffs; - - for (len = 128, layer = 1; len >= 2; len >>= 1, layer++) - __loop__( - invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer)) - invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))) - { - ntt_layer(r, len, layer); - } - - /* Check the stronger bound */ - debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); -} -#else /* MLKEM_USE_NATIVE_NTT */ - -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *p) -{ - debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); - ntt_native(p); - debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); -} -#endif /* MLKEM_USE_NATIVE_NTT */ - -#if !defined(MLKEM_USE_NATIVE_INTT) - -/* Compute one layer of inverse NTT */ -static void invntt_layer(int16_t *r, unsigned len, unsigned layer) -__contract__( - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7) - requires(len == (1 << (8 - layer))) - requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) -{ - unsigned start, k; - /* `layer` is a ghost variable used only in the specification */ - ((void)layer); - k = MLKEM_N / len - 1; - for (start = 0; start < MLKEM_N; start += 2 * len) - __loop__( - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) - invariant(start <= MLKEM_N && k <= 127) - /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */ - invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len)) - { - unsigned j; - int16_t zeta = zetas[k--]; - for (j = start; j < start + len; j++) - __loop__( - invariant(start <= j && j <= start + len) - invariant(start <= MLKEM_N && k <= 127) - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) - { - int16_t t = r[j]; - r[j] = barrett_reduce(t + r[j + len]); - r[j + len] = r[j + len] - t; - r[j + len] = fqmul(r[j + len], 
zeta); - } - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *p) -{ - /* - * Scale input polynomial to account for Montgomery factor - * and NTT twist. This also brings coefficients down to - * absolute value < MLKEM_Q. - */ - unsigned j, len, layer; - const int16_t f = 1441; - int16_t *r = p->coeffs; - - for (j = 0; j < MLKEM_N; j++) - __loop__( - invariant(j <= MLKEM_N) - invariant(array_abs_bound(r, 0, j, MLKEM_Q))) - { - r[j] = fqmul(r[j], f); - } - - /* Run the invNTT layers */ - for (len = 2, layer = 7; len <= 128; len <<= 1, layer--) - __loop__( - invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer))) - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) - { - invntt_layer(p->coeffs, len, layer); - } - - debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); -} -#else /* MLKEM_USE_NATIVE_INTT */ - -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *p) -{ - intt_native(p); - debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); -} -#endif /* MLKEM_USE_NATIVE_INTT */ - -MLKEM_NATIVE_INTERNAL_API -void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], - int16_t b_cached) -{ - int32_t t0, t1; - debug_assert_bound(a, 2, 0, UINT12_LIMIT); - - t0 = (int32_t)a[1] * b_cached; - t0 += (int32_t)a[0] * b[0]; - t1 = (int32_t)a[0] * b[1]; - t1 += (int32_t)a[1] * b[0]; - - /* |ti| < 2 * q * 2^15 */ - r[0] = montgomery_reduce(t0); - r[1] = montgomery_reduce(t1); - - debug_assert_abs_bound(r, 2, 2 * MLKEM_Q); -} - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt) -int empty_cu_ntt; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.h deleted file mode 100644 index 4e80d3ab3..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * 
SPDX-License-Identifier: Apache-2.0 - */ -#ifndef NTT_H -#define NTT_H -#include "common.h" - -#include -#include "cbmc.h" -#include "poly.h" -#include "reduce.h" - -#define zetas MLKEM_NAMESPACE(zetas) -extern const int16_t zetas[128]; - -#define poly_ntt MLKEM_NAMESPACE(poly_ntt) -/************************************************* - * Name: poly_ntt - * - * Description: Computes negacyclic number-theoretic transform (NTT) of - * a polynomial in place. - * - * The input is assumed to be in normal order and - * coefficient-wise bound by MLKEM_Q in absolute value. - * - * The output polynomial is in bitreversed order, and - * coefficient-wise bound by NTT_BOUND in absolute value. - * - * (NOTE: Sometimes the input to the NTT is actually smaller, - * which gives better bounds.) - * - * Arguments: - poly *p: pointer to in/output polynomial - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *r) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q)) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND)) -); - -#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont) -/************************************************* - * Name: poly_invntt_tomont - * - * Description: Computes inverse of negacyclic number-theoretic transform (NTT) - * of a polynomial in place; - * inputs assumed to be in bitreversed order, output in normal - * order - * - * The input is assumed to be in bitreversed order, and can - * have arbitrary coefficients in int16_t. - * - * The output polynomial is in normal order, and - * coefficient-wise bound by INVNTT_BOUND in absolute value. 
- * - * Arguments: - uint16_t *a: pointer to in/output polynomial - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *r) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND)) -); - -#define basemul_cached MLKEM_NAMESPACE(basemul_cached) -/************************************************************ - * Name: basemul_cached - * - * Description: Computes a representative modulo q of - * (a0*b0 + a1*b_cached, a0*b1 + a1*b0)/65536 - * - * If b_cached is b1*zeta, this represents the - * product of (a0 + a1*X) and (b0 + b1*X) in - * Fq[X]/(X^2 - zeta). - * - * Arguments: - r: Pointer to output polynomial - * Upon return, coefficients are bound by - * 2*MLKEM_Q in absolute value. - * - a: Pointer to first input polynomial - * Every coefficient must be in [0..4095] - * - b: Pointer to second input polynomial - * Can have arbitrary int16_t coefficients - * - b_cached: Some precomputed value, typically derived from - * b1 and a twiddle factor. Can be an arbitary int16_t. 
- ************************************************************/ -MLKEM_NATIVE_INTERNAL_API -void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], - int16_t b_cached) -__contract__( - requires(memory_no_alias(r, 2 * sizeof(int16_t))) - requires(memory_no_alias(a, 2 * sizeof(int16_t))) - requires(memory_no_alias(b, 2 * sizeof(int16_t))) - requires(array_bound(a, 0, 2, 0, UINT12_LIMIT)) - assigns(memory_slice(r, 2 * sizeof(int16_t))) - ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q)) -); - -#endif /* NTT_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/params.h index 57ea4c8ba..7f6c12625 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/params.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/params.h @@ -18,6 +18,7 @@ #define MLKEM_N 256 #define MLKEM_Q 3329 #define UINT12_LIMIT 4096 +#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */ #define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define MLKEM_SSBYTES 32 /* size in bytes of shared key */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.c index 7483ebf6d..e8a2e2c6e 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.c @@ -8,388 +8,246 @@ #include #include #include "arith_backend.h" -#include "cbd.h" #include "cbmc.h" #include "debug.h" #include "fips202x4.h" -#include "ntt.h" #include "poly.h" -#include "reduce.h" +#include "sampling.h" #include "symmetric.h" #include "verify.h" -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3) -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a) -{ - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - uint8_t t[8] = {0}; - for (j = 0; j < 8; j++) - __loop__( 
- invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_bound(t, 0, j, 0, 16))) - { - t[j] = scalar_compress_d4(a->coeffs[8 * i + j]); - } - - r[i * 4] = t[0] | (t[1] << 4); - r[i * 4 + 1] = t[2] | (t[3] << 4); - r[i * 4 + 2] = t[4] | (t[5] << 4); - r[i * 4 + 3] = t[6] | (t[7] << 4); - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a) -{ - unsigned j; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - for (j = 0; j < MLKEM_N / 4; j++) - __loop__(invariant(j <= MLKEM_N / 4)) - { - unsigned k; - uint16_t t[4]; - for (k = 0; k < 4; k++) - __loop__( - invariant(k <= 4) - invariant(forall(r, 0, k, t[r] < (1u << 10)))) - { - t[k] = scalar_compress_d10(a->coeffs[4 * j + k]); - } - - /* - * Make all implicit truncation explicit. No data is being - * truncated for the LHS's since each t[i] is 10-bit in size. - */ - r[5 * j + 0] = (t[0] >> 0) & 0xFF; - r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF); - r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF); - r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF); - r[5 * j + 4] = (t[3] >> 2); - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 2; i++) - __loop__( - invariant(i <= MLKEM_N / 2) - invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q))) - { - r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF); - r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF); - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); -} - -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d10(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]) +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16) +#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic) +#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce) +#define fqmul MLKEM_NAMESPACE(fqmul) +#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce) +#define basemul_cached MLKEM_NAMESPACE(basemul_cached) +#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q) +#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block) +#define ntt_layer MLKEM_NAMESPACE(ntt_layer) +#define invntt_layer MLKEM_NAMESPACE(invntt_layer) +/* End of static namespacing */ + +/************************************************* + * Name: cast_uint16_to_int16 + * + * Description: Cast uint16 value to int16 + * + * Returns: + * input x in 0 .. 32767: returns value unchanged + * input x in 32768 .. 65535: returns (x - 65536) + **************************************************/ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "conversion" +#endif +ALWAYS_INLINE +static INLINE int16_t cast_uint16_to_int16(uint16_t x) { - unsigned j; - for (j = 0; j < MLKEM_N / 4; j++) - __loop__( - invariant(j <= MLKEM_N / 4) - invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q))) - { - unsigned k; - uint16_t t[4]; - uint8_t const *base = &a[5 * j]; - - t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); - t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6)); - t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4)); - t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2)); - - for (k = 0; k < 4; k++) - __loop__( - invariant(k <= 4) - invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q))) - { - r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]); - } - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + /* + * PORTABILITY: This relies on uint16_t -> int16_t + * being implemented as the inverse of int16_t -> uint16_t, + * which is implementation-defined (C99 6.3.1.3 (3)) + * 
CBMC (correctly) fails to prove this conversion is OK, + * so we have to suppress that check here + */ + return (int16_t)x; } -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ - || MLKEM_K == 3) */ +#ifdef CBMC +#pragma CPROVER check pop +#endif -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a) +/************************************************* + * Name: montgomery_reduce_generic + * + * Description: Generic Montgomery reduction; given a 32-bit integer a, computes + * 16-bit integer congruent to a * R^-1 mod q, where R=2^16 + * + * Arguments: - int32_t a: input integer to be reduced + * + * Returns: integer congruent to a * R^-1 modulo q, with absolute value + * <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2 + * + **************************************************/ +ALWAYS_INLINE +static INLINE int16_t montgomery_reduce_generic(int32_t a) { - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */ + const uint32_t QINV = 62209; /* q^-1 mod 2^16 */ - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - uint8_t t[8] = {0}; - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_bound(t, 0, j, 0, 32))) - { - t[j] = scalar_compress_d5(a->coeffs[8 * i + j]); - } + /* Compute a*q^{-1} mod 2^16 in unsigned representatives */ + const uint16_t a_reduced = a & UINT16_MAX; + const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX; - /* - * Explicitly truncate to avoid warning about - * implicit truncation in CBMC, and use array indexing into - * r rather than pointer-arithmetic to simplify verification - */ - r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5)); - r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); - r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4)); 
- r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); - r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3)); - } -} + /* Lift to signed canonical representative mod 2^16. */ + const int16_t t = cast_uint16_to_int16(a_inverted); -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a) -{ - unsigned j; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + int32_t r = a - ((int32_t)t * MLKEM_Q); + /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */ - for (j = 0; j < MLKEM_N / 8; j++) - __loop__(invariant(j <= MLKEM_N / 8)) - { - unsigned k; - uint16_t t[8]; - for (k = 0; k < 8; k++) - __loop__( - invariant(k <= 8) - invariant(forall(r, 0, k, t[r] < (1u << 11)))) - { - t[k] = scalar_compress_d11(a->coeffs[8 * j + k]); - } + /* + * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, + * implementation-defined for negative left argument. Here, + * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) + */ + r = r >> 16; + /* Bounds: |r >> 16| <= ceil(|r| / 2^16) + * <= ceil(|a| / 2^16 + MLKEM_Q / 2) + * <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 + * + * (Note that |a >> n| = ceil(|a| / 2^16) for negative a) + */ - /* - * Make all implicit truncation explicit. No data is being - * truncated for the LHS's since each t[i] is 11-bit in size. 
- */ - r[11 * j + 0] = (t[0] >> 0) & 0xFF; - r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF); - r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF); - r[11 * j + 3] = (t[2] >> 2) & 0xFF; - r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF); - r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF); - r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF); - r[11 * j + 7] = (t[5] >> 1) & 0xFF; - r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF); - r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF); - r[11 * j + 10] = (t[7] >> 3); - } + return (int16_t)r; } -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]) +/************************************************* + * Name: montgomery_reduce + * + * Description: Montgomery reduction + * + * Arguments: - int32_t a: input integer to be reduced + * Must be smaller than 2 * 2^12 * 2^15 in absolute value. + * + * Returns: integer congruent to a * R^-1 modulo q, + * smaller than 2 * q in absolute value. + **************************************************/ +static INLINE int16_t montgomery_reduce(int32_t a) +__contract__( + requires(a > -(2 * UINT12_LIMIT * 32768)) + requires(a < (2 * UINT12_LIMIT * 32768)) + ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q) +) { - unsigned i; - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) - { - unsigned j; - uint8_t t[8]; - const unsigned offset = i * 5; - /* - * Explicitly truncate to avoid warning about - * implicit truncation in CBMC and unwind loop for ease - * of proof. 
- */ - - /* - * Decompress 5 8-bit bytes (so 40 bits) into - * 8 5-bit values stored in t[] - */ - t[0] = 0x1F & (a[offset + 0] >> 0); - t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3)); - t[2] = 0x1F & (a[offset + 1] >> 2); - t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1)); - t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4)); - t[5] = 0x1F & (a[offset + 3] >> 1); - t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2)); - t[7] = 0x1F & (a[offset + 4] >> 3); - - /* and copy to the correct slice in r[] */ - for (j = 0; j < 8; j++) - __loop__( - invariant(j <= 8 && i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) - { - r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]); - } - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + int16_t res; + debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768); + + res = montgomery_reduce_generic(a); + /* Bounds: + * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2 + * <= UINT12_LIMIT + (MLKEM_Q + 1) / 2 + * < 2 * MLKEM_Q */ + + debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q); + return res; } -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d11(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]) +#if !defined(MLKEM_USE_NATIVE_POLY_TOMONT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: fqmul + * + * Description: Montgomery multiplication modulo q=3329 + * + * Arguments: - int16_t a: first factor + * Can be any int16_t. + * - int16_t b: second factor. + * Must be signed canonical (abs value <(q+1)/2) + * + * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and + * smaller than q in absolute value. 
+ * + **************************************************/ +static INLINE int16_t fqmul(int16_t a, int16_t b) +__contract__( + requires(b > -HALF_Q) + requires(b < HALF_Q) + ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q) +) { - unsigned j; - for (j = 0; j < MLKEM_N / 8; j++) - __loop__( - invariant(j <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q))) - { - unsigned k; - uint16_t t[8]; - uint8_t const *base = &a[11 * j]; - t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); - t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5)); - t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) | - ((uint16_t)base[4] << 10)); - t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7)); - t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4)); - t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) | - ((uint16_t)base[8] << 9)); - t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6)); - t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3)); - - for (k = 0; k < 8; k++) - __loop__( - invariant(k <= 8) - invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q))) - { - r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]); - } - } + int16_t res; + debug_assert_abs_bound(&b, 1, HALF_Q); + + res = montgomery_reduce((int32_t)a * (int32_t)b); + /* Bounds: + * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2 + * < MLKEM_Q + */ - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + debug_assert_abs_bound(&res, 1, MLKEM_Q); + return res; } -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */ - -#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +#endif /* !defined(MLKEM_USE_NATIVE_POLY_TOMONT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_NTT) || \ + 
!defined(MLKEM_USE_NATIVE_INTT) */ + +#if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || !defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: barrett_reduce + * + * Description: Barrett reduction; given a 16-bit integer a, computes + * centered representative congruent to a mod q in + * {-(q-1)/2,...,(q-1)/2} + * + * Arguments: - int16_t a: input integer to be reduced + * + * Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. + **************************************************/ +static INLINE int16_t barrett_reduce(int16_t a) +__contract__( + ensures(return_value > -HALF_Q && return_value < HALF_Q) +) { - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_N / 2; i++) - __loop__(invariant(i <= MLKEM_N / 2)) - { - const uint16_t t0 = a->coeffs[2 * i]; - const uint16_t t1 = a->coeffs[2 * i + 1]; - /* - * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of - * significant data, so these can be packed into 24 bits or exactly - * 3 bytes, as follows. - */ - - /* Least significant bits 0 - 7 of t0. */ - r[3 * i + 0] = t0 & 0xFF; - - /* - * Most significant bits 8 - 11 of t0 become the least significant - * nibble of the second byte. The least significant 4 bits - * of t1 become the upper nibble of the second byte. - */ - r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0); + /* + * To divide by MLKEM_Q using Barrett multiplication, the "magic number" + * multiplier is round_to_nearest(2**26/MLKEM_Q) + */ + const int BPOWER = 26; + const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q; - /* Bits 4 - 11 of t1 become the third byte. 
*/ - r[3 * i + 2] = t1 >> 4; - } -} -#else /* MLKEM_USE_NATIVE_POLY_TOBYTES */ -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) -{ - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - poly_tobytes_native(r, a); -} -#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + /* + * Compute round_to_nearest(a/MLKEM_Q) using the multiplier + * above and shift by BPOWER places. + * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, + * implementation-defined for negative left argument. Here, + * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) + */ + const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER; -#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 2; i++) - __loop__( - invariant(i <= MLKEM_N / 2) - invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT))) - { - const uint8_t t0 = a[3 * i + 0]; - const uint8_t t1 = a[3 * i + 1]; - const uint8_t t2 = a[3 * i + 2]; - r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF); - r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4); - } + /* + * t is in -10 .. 
+10, so we need 32-bit math to + * evaluate t * MLKEM_Q and the subsequent subtraction + */ + int16_t res = (int16_t)(a - t * MLKEM_Q); - /* Note that the coefficients are not canonical */ - debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT); -} -#else /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -{ - poly_frombytes_native(r, a); + debug_assert_abs_bound(&res, 1, HALF_Q); + return res; } -#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ - -MLKEM_NATIVE_INTERNAL_API -void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +#endif /* !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || \ + !defined(MLKEM_USE_NATIVE_INTT) */ + +static void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], + int16_t b_cached) +__contract__( + requires(memory_no_alias(r, 2 * sizeof(int16_t))) + requires(memory_no_alias(a, 2 * sizeof(int16_t))) + requires(memory_no_alias(b, 2 * sizeof(int16_t))) + requires(array_bound(a, 0, 2, 0, UINT12_LIMIT)) + assigns(memory_slice(r, 2 * sizeof(int16_t))) + ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))) { - unsigned i; -#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8) -#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!" 
-#endif + int32_t t0, t1; + debug_assert_bound(a, 2, 0, UINT12_LIMIT); - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) - { - unsigned j; - for (j = 0; j < 8; j++) - __loop__( - invariant(i < MLKEM_N / 8 && j <= 8) - invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) - { - /* Prevent the compiler from recognizing this as a bit selection */ - uint8_t mask = value_barrier_u8(1u << j); - r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask); - } - } - debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); -} + t0 = (int32_t)a[1] * b_cached; + t0 += (int32_t)a[0] * b[0]; + t1 = (int32_t)a[0] * b[1]; + t1 += (int32_t)a[1] * b[0]; -MLKEM_NATIVE_INTERNAL_API -void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a) -{ - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + /* |ti| < 2 * q * 2^15 */ + r[0] = montgomery_reduce(t0); + r[1] = montgomery_reduce(t1); - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - msg[i] = 0; - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8)) - { - uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]); - msg[i] |= t << j; - } - } + debug_assert_abs_bound(r, 2, 2 * MLKEM_Q); } MLKEM_NATIVE_INTERNAL_API @@ -434,12 +292,46 @@ void poly_tomont(poly *r) MLKEM_NATIVE_INTERNAL_API void poly_tomont(poly *r) { - poly_tomont_native(r); + poly_tomont_native(r->coeffs); debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); } #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ #if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) +/************************************************************ + * Name: scalar_signed_to_unsigned_q + * + * Description: converts signed polynomial coefficient + * from signed (-3328 .. 3328) form to + * unsigned form (0 .. 3328). 
+ * + * Note: Cryptographic constant time implementation + * + * Examples: 0 -> 0 + * 1 -> 1 + * 3328 -> 3328 + * -1 -> 3328 + * -2 -> 3327 + * -3328 -> 1 + * + * Arguments: c: signed coefficient to be converted + ************************************************************/ +static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c) +__contract__( + requires(c > -MLKEM_Q && c < MLKEM_Q) + ensures(return_value >= 0 && return_value < MLKEM_Q) + ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q))) +{ + debug_assert_abs_bound(&c, 1, MLKEM_Q); + + /* Add Q if c is negative, but in constant time */ + c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c)); + + /* and therefore cast to uint16_t is safe. */ + debug_assert_bound(&c, 1, 0, MLKEM_Q); + return (uint16_t)c; +} + MLKEM_NATIVE_INTERNAL_API void poly_reduce(poly *r) { @@ -461,7 +353,7 @@ void poly_reduce(poly *r) MLKEM_NATIVE_INTERNAL_API void poly_reduce(poly *r) { - poly_reduce_native(r); + poly_reduce_native(r->coeffs); debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); } #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ @@ -520,13 +412,232 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a) MLKEM_NATIVE_INTERNAL_API void poly_mulcache_compute(poly_mulcache *x, const poly *a) { - poly_mulcache_compute_native(x, a); + poly_mulcache_compute_native(x->coeffs, a->coeffs); /* Omitting bounds assertion since native implementations may * decide not to use a mulcache. Note that the C backend implementation * of poly_basemul_montgomery_cached() does still include the check. */ } #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ +#if !defined(MLKEM_USE_NATIVE_NTT) +/* + * Computes a block CT butterflies with a fixed twiddle factor, + * using Montgomery multiplication. + * Parameters: + * - r: Pointer to base of polynomial (_not_ the base of butterfly block) + * - root: Twiddle factor to use for the butterfly. This must be in + * Montgomery form and signed canonical. 
+ * - start: Offset to the beginning of the butterfly block + * - len: Index difference between coefficients subject to a butterfly + * - bound: Ghost variable describing coefficient bound: Prior to `start`, + * coefficients must be bound by `bound + MLKEM_Q`. Post `start`, + * they must be bound by `bound`. + * When this function returns, output coefficients in the index range + * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`. + * Example: + * - start=8, len=4 + * This would compute the following four butterflies + * 8 -- 12 + * 9 -- 13 + * 10 -- 14 + * 11 -- 15 + * - start=4, len=2 + * This would compute the following two butterflies + * 4 -- 6 + * 5 -- 7 + */ +static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, + unsigned start, unsigned len, int bound) +__contract__( + requires(start < MLKEM_N) + requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N) + requires(0 <= bound && bound < INT16_MAX - MLKEM_Q) + requires(-HALF_Q < zeta && zeta < HALF_Q) + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(array_abs_bound(r, 0, start, bound + MLKEM_Q)) + requires(array_abs_bound(r, start, MLKEM_N, bound)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q)) + ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound))) +{ + /* `bound` is a ghost variable only needed in the CBMC specification */ + unsigned j; + ((void)bound); + for (j = start; j < start + len; j++) + __loop__( + invariant(start <= j && j <= start + len) + /* + * Coefficients are updated in strided pairs, so the bounds for the + * intermediate states alternate twice between the old and new bound + */ + invariant(array_abs_bound(r, 0, j, bound + MLKEM_Q)) + invariant(array_abs_bound(r, j, start + len, bound)) + invariant(array_abs_bound(r, start + len, j + len, bound + MLKEM_Q)) + invariant(array_abs_bound(r, j + len, MLKEM_N, bound))) + { + int16_t t; + t = fqmul(r[j + len], 
zeta); + r[j + len] = r[j] - t; + r[j] = r[j] + t; + } +} + +/* + *Compute one layer of forward NTT + * Parameters: + * - r: Pointer to base of polynomial + * - len: Stride of butterflies in this layer. + * - layer: Ghost variable indicating which layer is being applied. + * Must match `len` via `len == MLKEM_N >> layer`. + * Note: `len` could be dropped and computed in the function, but + * we are following the structure of the reference NTT from the + * official Kyber implementation here, merely adding `layer` as + * a ghost variable for the specifications. + */ +static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer) +__contract__( + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer)) + requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q))) +{ + unsigned start, k; + /* `layer` is a ghost variable only needed in the CBMC specification */ + ((void)layer); + /* Twiddle factors for layer n start at index 2^(layer-1) */ + k = MLKEM_N / (2 * len); + for (start = 0; start < MLKEM_N; start += 2 * len) + __loop__( + invariant(start < MLKEM_N + 2 * len) + invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N) + invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q)) + invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q))) + { + int16_t zeta = zetas[k++]; + ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q); + } +} + +/* + * Compute full forward NTT + * NOTE: This particular implementation satisfies a much tighter + * bound on the output coefficients (5*q) than the contractual one (8*q), + * but this is not needed in the calling code. Should we change the + * base multiplication strategy to require smaller NTT output bounds, + * the proof may need strengthening. 
+ */ + +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *p) +{ + unsigned len, layer; + int16_t *r; + debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); + r = p->coeffs; + + for (len = 128, layer = 1; len >= 2; len >>= 1, layer++) + __loop__( + invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer)) + invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))) + { + ntt_layer(r, len, layer); + } + + /* Check the stronger bound */ + debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); +} +#else /* MLKEM_USE_NATIVE_NTT */ + +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *p) +{ + debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); + ntt_native(p->coeffs); + debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); +} +#endif /* MLKEM_USE_NATIVE_NTT */ + +#if !defined(MLKEM_USE_NATIVE_INTT) + +/* Compute one layer of inverse NTT */ +static void invntt_layer(int16_t *r, unsigned len, unsigned layer) +__contract__( + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7) + requires(len == (1 << (8 - layer))) + requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) +{ + unsigned start, k; + /* `layer` is a ghost variable used only in the specification */ + ((void)layer); + k = MLKEM_N / len - 1; + for (start = 0; start < MLKEM_N; start += 2 * len) + __loop__( + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) + invariant(start <= MLKEM_N && k <= 127) + /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */ + invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len)) + { + unsigned j; + int16_t zeta = zetas[k--]; + for (j = start; j < start + len; j++) + __loop__( + invariant(start <= j && j <= start + len) + invariant(start <= MLKEM_N && k <= 127) + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) + { + int16_t t = r[j]; + r[j] = barrett_reduce(t + r[j + len]); + r[j + len] = r[j + len] - t; + r[j + len] = fqmul(r[j 
+ len], zeta); + } + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *p) +{ + /* + * Scale input polynomial to account for Montgomery factor + * and NTT twist. This also brings coefficients down to + * absolute value < MLKEM_Q. + */ + unsigned j, len, layer; + const int16_t f = 1441; + int16_t *r = p->coeffs; + + for (j = 0; j < MLKEM_N; j++) + __loop__( + invariant(j <= MLKEM_N) + invariant(array_abs_bound(r, 0, j, MLKEM_Q))) + { + r[j] = fqmul(r[j], f); + } + + /* Run the invNTT layers */ + for (len = 2, layer = 7; len <= 128; len <<= 1, layer--) + __loop__( + invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer))) + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) + { + invntt_layer(p->coeffs, len, layer); + } + + debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); +} +#else /* MLKEM_USE_NATIVE_INTT */ + +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *p) +{ + intt_native(p->coeffs); + debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); +} +#endif /* MLKEM_USE_NATIVE_INTT */ + #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ #define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.h index 6a14c785d..cb0d67c1a 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.h @@ -9,7 +9,7 @@ #include #include "cbmc.h" #include "common.h" -#include "reduce.h" +#include "debug.h" #include "verify.h" /* Absolute exclusive upper bound for the output of the inverse NTT */ @@ -18,6 +18,9 @@ /* Absolute exclusive upper bound for the output of the forward NTT */ #define NTT_BOUND (8 * MLKEM_Q) +#define zetas MLKEM_NAMESPACE(zetas) +extern const int16_t zetas[128]; + /* * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] @@ -38,520 +41,6 @@ typedef struct int16_t coeffs[MLKEM_N >> 1]; } poly_mulcache; -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1) -#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4) -#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5) -#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10) -#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11) -#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4) -#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5) -#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10) -#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11) -#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q) -/* End of static namespacing */ - -/************************************************************ - * Name: scalar_compress_d1 - * - * Description: Computes round(u * 2 / q) - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 1. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. 
- */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d1(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 2) - ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) ) -{ - uint32_t d0 = u << 1; - d0 *= 645083; - d0 += 1u << 30; - d0 >>= 31; - return d0; -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_compress_d4 - * - * Description: Computes round(u * 16 / q) % 16 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 4. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d4(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 16) - ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16)) -{ - uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */ - return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */ -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d4 - * - * Description: Computes round(u * q / 16) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 4. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d4(uint32_t u) -__contract__( - requires(0 <= u && u < 16) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 8) / 16; } - -/************************************************************ - * Name: scalar_compress_d5 - * - * Description: Computes round(u * 32 / q) % 32 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 5. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d5(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 32) - ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) ) -{ - uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */ - return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */ -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d5 - * - * Description: Computes round(u * q / 32) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 5. - * - * Arguments: - u: Unsigned canonical modulus modulo 32 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d5(uint32_t u) -__contract__( - requires(0 <= u && u < 32) - ensures(return_value <= MLKEM_Q - 1) -) { return ((u * MLKEM_Q) + 16) / 32; } - -/************************************************************ - * Name: scalar_compress_d10 - * - * Description: Computes round(u * 2**10 / q) % 2**10 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d10(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < (1u << 10)) - ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10))) -{ - uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^32 / MLKEM_Q) */ - d0 = (d0 + ((uint64_t)1u << 32)) >> 33; - return (d0 & 0x3FF); -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d10 - * - * Description: Computes round(u * q / 1024) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d10(uint32_t u) -__contract__( - requires(0 <= u && u < 1024) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 512) / 1024; } - -/************************************************************ - * Name: scalar_compress_d11 - * - * Description: Computes round(u * 2**11 / q) % 2**11 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 11. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d11(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < (1u << 11)) - ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11))) -{ - uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */ - d0 = (d0 + ((uint64_t)1u << 32)) >> 33; - return (d0 & 0x7FF); -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d11 - * - * Description: Computes round(u * q / 1024) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d11(uint32_t u) -__contract__( - requires(0 <= u && u < 2048) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 1024) / 2048; } - -/************************************************************ - * Name: scalar_signed_to_unsigned_q - * - * Description: converts signed polynomial coefficient - * from signed (-3328 .. 3328) form to - * unsigned form (0 .. 3328). - * - * Note: Cryptographic constant time implementation - * - * Examples: 0 -> 0 - * 1 -> 1 - * 3328 -> 3328 - * -1 -> 3328 - * -2 -> 3327 - * -3328 -> 1 - * - * Arguments: c: signed coefficient to be converted - ************************************************************/ -static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c) -__contract__( - requires(c > -MLKEM_Q && c < MLKEM_Q) - ensures(return_value >= 0 && return_value < MLKEM_Q) - ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q))) -{ - debug_assert_abs_bound(&c, 1, MLKEM_Q); - - /* Add Q if c is negative, but in constant time */ - c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c)); - - /* and therefore cast to uint16_t is safe. */ - debug_assert_bound(&c, 1, 0, MLKEM_Q); - return (uint16_t)c; -} - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \ - (MLKEM_K == 2 || MLKEM_K == 3) -#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4) -/************************************************* - * Name: poly_compress_d4 - * - * Description: Compression (4 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a); - -#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10) -/************************************************* - * Name: poly_compress_d10 - * - * Description: Compression (10 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a); - -#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4) -/************************************************* - * Name: poly_decompress_d4 - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]); - -#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10) -/************************************************* - * Name: poly_decompress_d10 - * - * Description: De-serialization and subsequent decompression (10 bits) of a - * polynomial; approximate inverse of poly_compress_d10 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d10(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]); -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ - || MLKEM_K == 3) */ - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 -#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5) -/************************************************* - * Name: poly_compress_d5 - * - * Description: Compression (5 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a); - -#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11) -/************************************************* - * Name: poly_compress_d11 - * - * Description: Compression (11 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a); - -#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5) -/************************************************* - * Name: poly_decompress_d5 - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]); - -#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11) -/************************************************* - * Name: poly_decompress_d11 - * - * Description: De-serialization and subsequent decompression (11 bits) of a - * polynomial; approximate inverse of poly_compress_d11 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d11(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]); -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \ - */ - -#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes) -/************************************************* - * Name: poly_tobytes - * - * Description: Serialization of a polynomial. - * Signed coefficients are converted to - * unsigned form before serialization. 
- * - * Arguments: INPUT: - * - a: const pointer to input polynomial, - * with each coefficient in the range [0,1,..,Q-1] - * OUTPUT - * - r: pointer to output byte array - * (of MLKEM_POLYBYTES bytes) - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYBYTES)) - requires(memory_no_alias(a, sizeof(poly))) - requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(r)) -); - - -#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes) -/************************************************* - * Name: poly_frombytes - * - * Description: De-serialization of a polynomial. - * - * Arguments: INPUT - * - a: pointer to input byte array - * (of MLKEM_POLYBYTES bytes) - * OUTPUT - * - r: pointer to output polynomial, with - * each coefficient unsigned and in the range - * 0 .. 4095 - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)) -); - - -#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg) -/************************************************* - * Name: poly_frommsg - * - * Description: Convert 32-byte message to polynomial - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *msg: pointer to input message - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) -__contract__( - requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(object_whole(r)) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) -); - -#define poly_tomsg 
MLKEM_NAMESPACE(poly_tomsg) -/************************************************* - * Name: poly_tomsg - * - * Description: Convert polynomial to 32-byte message - * - * Arguments: - uint8_t *msg: pointer to output message - * - const poly *r: pointer to input polynomial - * Coefficients must be unsigned canonical - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r) -__contract__( - requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(msg)) -); - #define poly_basemul_montgomery_cached \ MLKEM_NAMESPACE(poly_basemul_montgomery_cached) /************************************************* @@ -715,4 +204,56 @@ __contract__( assigns(object_whole(r)) ); +#define poly_ntt MLKEM_NAMESPACE(poly_ntt) +/************************************************* + * Name: poly_ntt + * + * Description: Computes negacyclic number-theoretic transform (NTT) of + * a polynomial in place. + * + * The input is assumed to be in normal order and + * coefficient-wise bound by MLKEM_Q in absolute value. + * + * The output polynomial is in bitreversed order, and + * coefficient-wise bound by NTT_BOUND in absolute value. + * + * (NOTE: Sometimes the input to the NTT is actually smaller, + * which gives better bounds.) 
+ * + * Arguments: - poly *r: pointer to in/output polynomial + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *r) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q)) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND)) +); + +#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont) +/************************************************* + * Name: poly_invntt_tomont + * + * Description: Computes inverse of negacyclic number-theoretic transform (NTT) + * of a polynomial in place; + * inputs assumed to be in bitreversed order, output in normal + * order + * + * The input is assumed to be in bitreversed order, and can + * have arbitrary coefficients in int16_t. + * + * The output polynomial is in normal order, and + * coefficient-wise bound by INVNTT_BOUND in absolute value. + * + * Arguments: - poly *r: pointer to in/output polynomial + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *r) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND)) +); + #endif /* POLY_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly_k.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly_k.c new file mode 100644 index 000000000..c2d330ea9 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly_k.c @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include "poly_k.h" +#include <stdint.h> +#include <string.h> +#include "arith_backend.h" +#include "compress.h" +#include "sampling.h" +#include "symmetric.h" + +#include "debug.h" + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. 
with varying security levels) + * within a single compilation unit. */ +#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1) +#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2) +/* End of static namespacing */ + +MLKEM_NATIVE_INTERNAL_API +void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU], + const polyvec *a) +{ + unsigned i; + debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_K; i++) + { + poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]); + } +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_decompress_du(polyvec *r, + const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU]) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU); + } + + debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a) +{ + unsigned i; + debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_K; i++) + { + poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]); + } +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES]) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES); + } + + debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_ntt(polyvec *r) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_ntt(&r->vec[i]); + } + + debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_invntt_tomont(polyvec *r) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_invntt_tomont(&r->vec[i]); + } + + debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND); +} + +#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, + 
+ const polyvec *b, + const polyvec_mulcache *b_cache) +{ + unsigned i; + poly t; + debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); + + poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]); + for (i = 1; i < MLKEM_K; i++) + { + poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i], + &b_cache->vec[i]); + poly_add(r, &t); + } + + /* + * This bound is true for the C implementation, but not needed + * in the higher level bounds reasoning. It is thus omitted + * from the spec to not unnecessarily constrain native + * implementations, but checked here nonetheless. + */ + debug_assert_abs_bound(r, MLKEM_N, MLKEM_K * 2 * MLKEM_Q); +} +#else /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, + const polyvec *b, + const polyvec_mulcache *b_cache) +{ + debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); + /* Omitting bounds assertion for cache since native implementations may + * decide not to use a mulcache. Note that the C backend implementation + * of poly_basemul_montgomery_cached() does still include the check. 
*/ + polyvec_basemul_acc_montgomery_cached_native(r->coeffs, (const int16_t *)a, + (const int16_t *)b, + (const int16_t *)b_cache); +} +#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ + +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) +{ + polyvec_mulcache b_cache; + polyvec_mulcache_compute(&b_cache, b); + polyvec_basemul_acc_montgomery_cached(r, a, b, &b_cache); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_mulcache_compute(&x->vec[i], &a->vec[i]); + } +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_reduce(polyvec *r) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_reduce(&r->vec[i]); + } + + debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_add(polyvec *r, const polyvec *b) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_add(&r->vec[i], &b->vec[i]); + } +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_tomont(polyvec *r) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_tomont(&r->vec[i]); + } + + debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q); +} + + +/************************************************* + * Name: poly_cbd_eta1 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter MLKEM_ETA1. 
+ * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +static INLINE void poly_cbd_eta1(poly *r, + const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4]) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4)) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1)) +) +{ +#if MLKEM_ETA1 == 2 + poly_cbd2(r, buf); +#elif MLKEM_ETA1 == 3 + poly_cbd3(r, buf); +#else +#error "Invalid value of MLKEM_ETA1" +#endif +} + +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, + const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0, + uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) +{ + ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1]; + ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1]; + ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1]; + ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1]; + memcpy(extkey0, seed, MLKEM_SYMBYTES); + memcpy(extkey1, seed, MLKEM_SYMBYTES); + memcpy(extkey2, seed, MLKEM_SYMBYTES); + memcpy(extkey3, seed, MLKEM_SYMBYTES); + extkey0[MLKEM_SYMBYTES] = nonce0; + extkey1[MLKEM_SYMBYTES] = nonce1; + extkey2[MLKEM_SYMBYTES] = nonce2; + extkey3[MLKEM_SYMBYTES] = nonce3; + prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3); + poly_cbd_eta1(r0, buf0); + poly_cbd_eta1(r1, buf1); + poly_cbd_eta1(r2, buf2); + poly_cbd_eta1(r3, buf3); + + debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1); +} + +#if MLKEM_K == 2 || MLKEM_K == 4 
+/************************************************* + * Name: poly_cbd_eta2 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter MLKEM_ETA2. + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +static INLINE void poly_cbd_eta2(poly *r, + const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4)) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))) +{ +#if MLKEM_ETA2 == 2 + poly_cbd2(r, buf); +#else +#error "Invalid value of MLKEM_ETA2" +#endif +} + +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES], + uint8_t nonce) +{ + ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]; + ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1]; + + memcpy(extkey, seed, MLKEM_SYMBYTES); + extkey[MLKEM_SYMBYTES] = nonce; + prf_eta2(buf, extkey); + + poly_cbd_eta2(r, buf); + + debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1); +} +#endif /* MLKEM_K == 2 || MLKEM_K == 4 */ + + +#if MLKEM_K == 2 +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3, + const uint8_t seed[MLKEM_SYMBYTES], + uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, + uint8_t nonce3) +{ + ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4]; + ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1]; + memcpy(extkey[0], seed, MLKEM_SYMBYTES); + memcpy(extkey[1], seed, MLKEM_SYMBYTES); + memcpy(extkey[2], seed, MLKEM_SYMBYTES); + memcpy(extkey[3], seed, MLKEM_SYMBYTES); + extkey[0][MLKEM_SYMBYTES] = nonce0; + extkey[1][MLKEM_SYMBYTES] = nonce1; + extkey[2][MLKEM_SYMBYTES] = nonce2; + 
+ extkey[3][MLKEM_SYMBYTES] = nonce3; + + prf_eta1(buf1[0], extkey[0]); + prf_eta1(buf1[1], extkey[1]); + prf_eta2(buf2[0], extkey[2]); + prf_eta2(buf2[1], extkey[3]); + + poly_cbd_eta1(r0, buf1[0]); + poly_cbd_eta1(r1, buf1[1]); + poly_cbd_eta2(r2, buf2[0]); + poly_cbd_eta2(r3, buf2[1]); + + debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1); + debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1); +} +#endif /* MLKEM_K == 2 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly_k.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly_k.h new file mode 100644 index 000000000..0aea95912 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly_k.h @@ -0,0 +1,596 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef POLY_K_H +#define POLY_K_H + +#include <stdint.h> +#include "common.h" +#include "compress.h" +#include "poly.h" + +#define polyvec MLKEM_NAMESPACE_K(polyvec) +typedef struct +{ + poly vec[MLKEM_K]; +} ALIGN polyvec; + +#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache) +typedef struct +{ + poly_mulcache vec[MLKEM_K]; +} polyvec_mulcache; + +#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du) +/************************************************* + * Name: poly_compress_du + * + * Description: Compression (du bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. 
+ **************************************************/ +static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], + const poly *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU)) + requires(memory_no_alias(a, sizeof(poly))) + requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU))) +{ +#if MLKEM_DU == 10 + poly_compress_d10(r, a); +#elif MLKEM_DU == 11 + poly_compress_d11(r, a); +#else +#error "Invalid value of MLKEM_DU" +#endif +} + +#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du) +/************************************************* + * Name: poly_decompress_du + * + * Description: De-serialization and subsequent decompression (du bits) of a + * polynomial; approximate inverse of poly_compress_du + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). 
+ * + **************************************************/ +static INLINE void poly_decompress_du( + poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) +{ +#if MLKEM_DU == 10 + poly_decompress_d10(r, a); +#elif MLKEM_DU == 11 + poly_decompress_d11(r, a); +#else +#error "Invalid value of MLKEM_DU" +#endif +} + +#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv) +/************************************************* + * Name: poly_compress_dv + * + * Description: Compression (dv bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. 
+ **************************************************/ +static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], + const poly *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV)) + requires(memory_no_alias(a, sizeof(poly))) + requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(r))) +{ +#if MLKEM_DV == 4 + poly_compress_d4(r, a); +#elif MLKEM_DV == 5 + poly_compress_d5(r, a); +#else +#error "Invalid value of MLKEM_DV" +#endif +} + + +#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv) +/************************************************* + * Name: poly_decompress_dv + * + * Description: De-serialization and subsequent decompression (dv bits) of a + * polynomial; approximate inverse of poly_compress + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +static INLINE void poly_decompress_dv( + poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(object_whole(r)) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) +{ +#if MLKEM_DV == 4 + poly_decompress_d4(r, a); +#elif MLKEM_DV == 5 + poly_decompress_d5(r, a); +#else +#error "Invalid value of MLKEM_DV" +#endif +} + +#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du) +/************************************************* + * Name: polyvec_compress_du + * + * Description: Compress and serialize vector of polynomials + * + * Arguments: - uint8_t *r: pointer to output byte array + * (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU) + * - const polyvec *a: pointer to input vector of polynomials. 
+ * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU], + const polyvec *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU)) + requires(memory_no_alias(a, sizeof(polyvec))) + requires(forall(k0, 0, MLKEM_K, + array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) + assigns(object_whole(r)) +); + +#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du) +/************************************************* + * Name: polyvec_decompress_du + * + * Description: De-serialize and decompress vector of polynomials; + * approximate inverse of polyvec_compress_du + * + * Arguments: - polyvec *r: pointer to output vector of polynomials. + * Output will have coefficients normalized to [0,..,q-1]. + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU) + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_decompress_du(polyvec *r, + const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU)) + requires(memory_no_alias(r, sizeof(polyvec))) + assigns(object_whole(r)) + ensures(forall(k0, 0, MLKEM_K, + array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) +); + +#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes) +/************************************************* + * Name: polyvec_tobytes + * + * Description: Serialize vector of polynomials + * + * Arguments: - uint8_t *r: pointer to output byte array + * (needs space for MLKEM_POLYVECBYTES) + * - const polyvec *a: pointer to input vector of polynomials + * Each polynomial must have coefficients in [0,..,q-1]. 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a) +__contract__( + requires(memory_no_alias(a, sizeof(polyvec))) + requires(memory_no_alias(r, MLKEM_POLYVECBYTES)) + requires(forall(k0, 0, MLKEM_K, + array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) + assigns(object_whole(r)) +); + +#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes) +/************************************************* + * Name: polyvec_frombytes + * + * Description: De-serialize vector of polynomials; + * inverse of polyvec_tobytes + * + * Arguments: - polyvec *r: pointer to output vector of polynomials. + * Output will have coefficients normalized in [0..4095]. + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYVECBYTES) + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES]) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + requires(memory_no_alias(a, MLKEM_POLYVECBYTES)) + assigns(object_whole(r)) + ensures(forall(k0, 0, MLKEM_K, + array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) +); + +#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt) +/************************************************* + * Name: polyvec_ntt + * + * Description: Apply forward NTT to all elements of a vector of polynomials. + * + * The input is assumed to be in normal order and + * coefficient-wise bound by MLKEM_Q in absolute value. + * + * The output polynomial is in bitreversed order, and + * coefficient-wise bound by NTT_BOUND in absolute value.
+ * + * Arguments: - polyvec *r: pointer to in/output vector of polynomials + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_ntt(polyvec *r) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + requires(forall(j, 0, MLKEM_K, + array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q))) + assigns(object_whole(r)) + ensures(forall(j, 0, MLKEM_K, + array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND))) +); + +#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont) +/************************************************* + * Name: polyvec_invntt_tomont + * + * Description: Apply inverse NTT to all elements of a vector of polynomials + * and multiply by Montgomery factor 2^16 + * + * The input is assumed to be in bitreversed order, and can + * have arbitrary coefficients in int16_t. + * + * The output polynomial is in normal order, and + * coefficient-wise bound by INVNTT_BOUND in absolute value. + * + * + * Arguments: - polyvec *r: pointer to in/output vector of polynomials + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_invntt_tomont(polyvec *r) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + assigns(object_whole(r)) + ensures(forall(j, 0, MLKEM_K, + array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, INVNTT_BOUND))) +); + +#define polyvec_basemul_acc_montgomery \ + MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery) +/************************************************* + * Name: polyvec_basemul_acc_montgomery + * + * Description: Multiply elements of a and b in NTT domain, accumulate into r, + * and multiply by 2^-16. 
+ * + * Arguments: - poly *r: pointer to output polynomial + * - const polyvec *a: pointer to first input vector of polynomials + * - const polyvec *b: pointer to second input vector of polynomials + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(a, sizeof(polyvec))) + requires(memory_no_alias(b, sizeof(polyvec))) + requires(forall(k1, 0, MLKEM_K, + array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) + assigns(memory_slice(r, sizeof(poly))) +); + + +#define polyvec_basemul_acc_montgomery_cached \ + MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached) +/************************************************* + * Name: polyvec_basemul_acc_montgomery_cached + * + * Description: Scalar product of two vectors of polynomials in NTT domain, + * using mulcache for second operand. + * + * Bounds: + * - Every coefficient of a is assumed to be in [0..4095] + * - No bounds guarantees for the coefficients in the result. + * + * Arguments: - poly *r: pointer to output polynomial + * - const polyvec *a: pointer to first input polynomial vector + * - const polyvec *b: pointer to second input polynomial vector + * - const polyvec_mulcache *b_cache: pointer to mulcache + * for second input polynomial vector. Can be computed + * via polyvec_mulcache_compute(). 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, + const polyvec *b, + const polyvec_mulcache *b_cache) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(a, sizeof(polyvec))) + requires(memory_no_alias(b, sizeof(polyvec))) + requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache))) + requires(forall(k1, 0, MLKEM_K, + array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) + assigns(memory_slice(r, sizeof(poly))) +); + +#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute) +/************************************************************ + * Name: polyvec_mulcache_compute + * + * Description: Computes the mulcache for a vector of polynomials in NTT domain + * + * The mulcache of a degree-2 polynomial b := b0 + b1*X + * in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when + * computing products of b in Fq[X]/(X^2-zeta). + * + * The mulcache of a polynomial in NTT domain -- which is + * a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta), + * for varying zeta, is the 128-tuple of mulcaches of those + * polynomials. + * + * The mulcache of a vector of polynomials is the vector + * of mulcaches of its entries. + * + * Arguments: - x: Pointer to mulcache to be populated + * - a: Pointer to input polynomial vector + ************************************************************/ +/* + * NOTE: The default C implementation of this function populates + * the mulcache with values in (-q,q), but this is not needed for the + * higher level safety proofs, and thus not part of the spec. 
+ */ +MLKEM_NATIVE_INTERNAL_API +void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a) +__contract__( + requires(memory_no_alias(x, sizeof(polyvec_mulcache))) + requires(memory_no_alias(a, sizeof(polyvec))) + assigns(object_whole(x)) +); + +#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce) +/************************************************* + * Name: polyvec_reduce + * + * Description: Applies Barrett reduction to each coefficient + * of each element of a vector of polynomials; + * for details of the Barrett reduction see comments in reduce.c + * + * Arguments: - polyvec *r: pointer to input/output polynomial + **************************************************/ +/* + * NOTE: The semantics of polyvec_reduce() is different in + * the reference implementation, which requires + * signed canonical output data. Unsigned canonical + * outputs are better suited to the only remaining + * use of poly_reduce() in the context of (de)serialization. + */ +MLKEM_NATIVE_INTERNAL_API +void polyvec_reduce(polyvec *r) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + assigns(object_whole(r)) + ensures(forall(k0, 0, MLKEM_K, + array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) +); + +#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add) +/************************************************* + * Name: polyvec_add + * + * Description: Add vectors of polynomials + * + * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be + * added to + * - const polyvec *b: pointer to second input vector of polynomials + * + * The coefficients of r and b must be so that the addition does + * not overflow. Otherwise, the behaviour of this function is undefined. + * + * The coefficients returned in *r are in int16_t which is sufficient + * to prove type-safety of calling units. Therefore, no stronger + * ensures clause is required on this function. 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_add(polyvec *r, const polyvec *b) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + requires(memory_no_alias(b, sizeof(polyvec))) + requires(forall(j0, 0, MLKEM_K, + forall(k0, 0, MLKEM_N, + (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX))) + requires(forall(j1, 0, MLKEM_K, + forall(k1, 0, MLKEM_N, + (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN))) + assigns(object_whole(r)) +); + +#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont) +/************************************************* + * Name: polyvec_tomont + * + * Description: Inplace conversion of all coefficients of a polynomial + * vector from normal domain to Montgomery domain + * + * Bounds: Output < q in absolute value. + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_tomont(polyvec *r) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + assigns(memory_slice(r, sizeof(polyvec))) + assigns(object_whole(r)) + ensures(forall(j, 0, MLKEM_K, + array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q))) +); + +#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x) +/************************************************* + * Name: poly_getnoise_eta1_4x + * + * Description: Batch sample four polynomials deterministically from a seed + * and nonces, with output polynomials close to centered binomial distribution + * with parameter MLKEM_ETA1. 
+ * + * Arguments: - poly *r{0,1,2,3}: pointer to output polynomial + * - const uint8_t *seed: pointer to input seed + * (of length MLKEM_SYMBYTES bytes) + * - uint8_t nonce{0,1,2,3}: one-byte input nonce + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, + const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0, + uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) +/* Depending on MLKEM_K, the pointers passed to this function belong + to the same objects, so we cannot use memory_no_alias for r0-r3. + + NOTE: Somehow it is important to use memory_no_alias() first in the + conjunctions defining each case. +*/ +#if MLKEM_K == 2 +__contract__( + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */ + (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) && + r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2))) + assigns(memory_slice(r0, sizeof(poly))) + assigns(memory_slice(r1, sizeof(poly))) + assigns(memory_slice(r2, sizeof(poly))) + assigns(memory_slice(r3, sizeof(poly))) + ensures( + array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); +); +#elif MLKEM_K == 4 +__contract__( + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + requires( /* Case B: r0, r1, r2, r3 consecutive */ + (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3)) + assigns(memory_slice(r0, sizeof(poly))) + assigns(memory_slice(r1, sizeof(poly))) + assigns(memory_slice(r2, sizeof(poly))) + assigns(memory_slice(r3, sizeof(poly))) + ensures( + array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 
1) + && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); +); +#elif MLKEM_K == 3 +__contract__( + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + requires( /* Case C: r0, r1, r2 consecutive */ + (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) && + r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0))) + assigns(memory_slice(r0, sizeof(poly))) + assigns(memory_slice(r1, sizeof(poly))) + assigns(memory_slice(r2, sizeof(poly))) + assigns(memory_slice(r3, sizeof(poly))) + ensures( + array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); +); +#endif /* MLKEM_K */ + +#if MLKEM_ETA1 == MLKEM_ETA2 +/* + * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024 + * where MLKEM_ETA2 = MLKEM_ETA1 = 2. + * For ml-kem-512, poly_getnoise_eta1122_4x is used instead. + */ +#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x +#endif /* MLKEM_ETA1 == MLKEM_ETA2 */ + +#if MLKEM_K == 2 || MLKEM_K == 4 +#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2) +/************************************************* + * Name: poly_getnoise_eta2 + * + * Description: Sample a polynomial deterministically from a seed and a nonce, + * with output polynomial close to centered binomial distribution + * with parameter MLKEM_ETA2 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *seed: pointer to input seed + * (of length MLKEM_SYMBYTES bytes) + * - uint8_t nonce: one-byte input nonce + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES], + uint8_t nonce) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + assigns(object_whole(r)) + ensures(array_abs_bound(r->coeffs, 0, 
MLKEM_N, MLKEM_ETA2 + 1)) +); +#endif /* MLKEM_K == 2 || MLKEM_K == 4 */ + +#if MLKEM_K == 2 +#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x) +/************************************************* + * Name: poly_getnoise_eta1122_4x + * + * Description: Batch sample four polynomials deterministically from a seed + * and nonces, with output polynomials close to centered binomial + * distribution with parameters MLKEM_ETA1 (r0, r1) and MLKEM_ETA2 (r2, r3) + * + * Arguments: - poly *r{0,1,2,3}: pointer to output polynomial + * - const uint8_t *seed: pointer to input seed + * (of length MLKEM_SYMBYTES bytes) + * - uint8_t nonce{0,1,2,3}: one-byte input nonce + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3, + const uint8_t seed[MLKEM_SYMBYTES], + uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, + uint8_t nonce3) +__contract__( + requires( /* r0, r1 consecutive, r2, r3 consecutive */ + (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) && + r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2))) + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3)) + ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1) + && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)); +); +#endif /* MLKEM_K == 2 */ + +#endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.c deleted file mode 100644 index 50ea1c34a..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.c +++ /dev/null @@ -1,330 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "polyvec.h" -#include -#include -#include
"arith_backend.h" -#include "cbd.h" -#include "ntt.h" -#include "poly.h" -#include "symmetric.h" - -#include "debug.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1) -#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2) -/* End of static namespacing */ - -MLKEM_NATIVE_INTERNAL_API -void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU], - const polyvec *a) -{ - unsigned i; - debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_K; i++) - { - poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]); - } -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_decompress_du(polyvec *r, - const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU]) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU); - } - - debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a) -{ - unsigned i; - debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_K; i++) - { - poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]); - } -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES]) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES); - } - - debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_ntt(polyvec *r) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_ntt(&r->vec[i]); - } - - debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_invntt_tomont(polyvec *r) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_invntt_tomont(&r->vec[i]); - } - - 
debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND); -} - -#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, - const polyvec *b, - const polyvec_mulcache *b_cache) -{ - unsigned i; - poly t; - debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); - - poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]); - for (i = 1; i < MLKEM_K; i++) - { - poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i], - &b_cache->vec[i]); - poly_add(r, &t); - } - - /* - * This bound is true for the C implementation, but not needed - * in the higher level bounds reasoning. It is thus omitted - * them from the spec to not unnecessarily constrain native - * implementations, but checked here nonetheless. - */ - debug_assert_abs_bound(r, MLKEM_K, MLKEM_N * 2 * MLKEM_Q); -} -#else /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, - const polyvec *b, - const polyvec_mulcache *b_cache) -{ - debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); - /* Omitting bounds assertion for cache since native implementations may - * decide not to use a mulcache. Note that the C backend implementation - * of poly_basemul_montgomery_cached() does still include the check. 
*/ - polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache); -} -#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ - -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) -{ - polyvec_mulcache b_cache; - polyvec_mulcache_compute(&b_cache, b); - polyvec_basemul_acc_montgomery_cached(r, a, b, &b_cache); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_mulcache_compute(&x->vec[i], &a->vec[i]); - } -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_reduce(polyvec *r) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_reduce(&r->vec[i]); - } - - debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_add(polyvec *r, const polyvec *b) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_add(&r->vec[i], &b->vec[i]); - } -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_tomont(polyvec *r) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_tomont(&r->vec[i]); - } - - debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q); -} - - -/************************************************* - * Name: poly_cbd_eta1 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter MLKEM_ETA1. 
- * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -static INLINE void poly_cbd_eta1(poly *r, - const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4]) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4)) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1)) -) -{ -#if MLKEM_ETA1 == 2 - poly_cbd2(r, buf); -#elif MLKEM_ETA1 == 3 - poly_cbd3(r, buf); -#else -#error "Invalid value of MLKEM_ETA1" -#endif -} - -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0, - uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) -{ - ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1]; - ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1]; - ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1]; - ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1]; - memcpy(extkey0, seed, MLKEM_SYMBYTES); - memcpy(extkey1, seed, MLKEM_SYMBYTES); - memcpy(extkey2, seed, MLKEM_SYMBYTES); - memcpy(extkey3, seed, MLKEM_SYMBYTES); - extkey0[MLKEM_SYMBYTES] = nonce0; - extkey1[MLKEM_SYMBYTES] = nonce1; - extkey2[MLKEM_SYMBYTES] = nonce2; - extkey3[MLKEM_SYMBYTES] = nonce3; - prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3); - poly_cbd_eta1(r0, buf0); - poly_cbd_eta1(r1, buf1); - poly_cbd_eta1(r2, buf2); - poly_cbd_eta1(r3, buf3); - - debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1); -} - -#if MLKEM_K == 2 || MLKEM_K == 4 
-/************************************************* - * Name: poly_cbd_eta2 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter MLKEM_ETA2. - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -static INLINE void poly_cbd_eta2(poly *r, - const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4)) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))) -{ -#if MLKEM_ETA2 == 2 - poly_cbd2(r, buf); -#else -#error "Invalid value of MLKEM_ETA2" -#endif -} - -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES], - uint8_t nonce) -{ - ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]; - ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1]; - - memcpy(extkey, seed, MLKEM_SYMBYTES); - extkey[MLKEM_SYMBYTES] = nonce; - prf_eta2(buf, extkey); - - poly_cbd_eta2(r, buf); - - debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA1 + 1); -} -#endif /* MLKEM_K == 2 || MLKEM_K == 4 */ - - -#if MLKEM_K == 2 -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t seed[MLKEM_SYMBYTES], - uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, - uint8_t nonce3) -{ - ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4]; - ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1]; - memcpy(extkey[0], seed, MLKEM_SYMBYTES); - memcpy(extkey[1], seed, MLKEM_SYMBYTES); - memcpy(extkey[2], seed, MLKEM_SYMBYTES); - memcpy(extkey[3], seed, MLKEM_SYMBYTES); - extkey[0][MLKEM_SYMBYTES] = nonce0; - extkey[1][MLKEM_SYMBYTES] = nonce1; - extkey[2][MLKEM_SYMBYTES] = nonce2; - 
extkey[3][MLKEM_SYMBYTES] = nonce3; - - prf_eta1(buf1[0], extkey[0]); - prf_eta1(buf1[1], extkey[1]); - prf_eta2(buf2[0], extkey[2]); - prf_eta2(buf2[1], extkey[3]); - - poly_cbd_eta1(r0, buf1[0]); - poly_cbd_eta1(r1, buf1[1]); - poly_cbd_eta2(r2, buf2[0]); - poly_cbd_eta2(r3, buf2[1]); - - debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1); - debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1); -} -#endif /* MLKEM_K == 2 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.h deleted file mode 100644 index 8be8579e0..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.h +++ /dev/null @@ -1,595 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef POLYVEC_H -#define POLYVEC_H - -#include -#include "common.h" -#include "poly.h" - -#define polyvec MLKEM_NAMESPACE_K(polyvec) -typedef struct -{ - poly vec[MLKEM_K]; -} ALIGN polyvec; - -#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache) -typedef struct -{ - poly_mulcache vec[MLKEM_K]; -} polyvec_mulcache; - -#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du) -/************************************************* - * Name: poly_compress_du - * - * Description: Compression (du bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], - const poly *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU)) - requires(memory_no_alias(a, sizeof(poly))) - requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU))) -{ -#if MLKEM_DU == 10 - poly_compress_d10(r, a); -#elif MLKEM_DU == 11 - poly_compress_d11(r, a); -#else -#error "Invalid value of MLKEM_DU" -#endif -} - -#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du) -/************************************************* - * Name: poly_decompress_du - * - * Description: De-serialization and subsequent decompression (du bits) of a - * polynomial; approximate inverse of poly_compress_du - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -static INLINE void poly_decompress_du( - poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) -{ -#if MLKEM_DU == 10 - poly_decompress_d10(r, a); -#elif MLKEM_DU == 11 - poly_decompress_d11(r, a); -#else -#error "Invalid value of MLKEM_DU" -#endif -} - -#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv) -/************************************************* - * Name: poly_compress_dv - * - * Description: Compression (dv bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], - const poly *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV)) - requires(memory_no_alias(a, sizeof(poly))) - requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(r))) -{ -#if MLKEM_DV == 4 - poly_compress_d4(r, a); -#elif MLKEM_DV == 5 - poly_compress_d5(r, a); -#else -#error "Invalid value of MLKEM_DV" -#endif -} - - -#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv) -/************************************************* - * Name: poly_decompress_dv - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -static INLINE void poly_decompress_dv( - poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(object_whole(r)) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) -{ -#if MLKEM_DV == 4 - poly_decompress_d4(r, a); -#elif MLKEM_DV == 5 - poly_decompress_d5(r, a); -#else -#error "Invalid value of MLKEM_DV" -#endif -} - -#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du) -/************************************************* - * Name: polyvec_compress_du - * - * Description: Compress and serialize vector of polynomials - * - * Arguments: - uint8_t *r: pointer to output byte array - * (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU) - * - const polyvec *a: pointer to input vector of polynomials. 
- * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU], - const polyvec *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU)) - requires(memory_no_alias(a, sizeof(polyvec))) - requires(forall(k0, 0, MLKEM_K, - array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) - assigns(object_whole(r)) -); - -#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du) -/************************************************* - * Name: polyvec_decompress_du - * - * Description: De-serialize and decompress vector of polynomials; - * approximate inverse of polyvec_compress_du - * - * Arguments: - polyvec *r: pointer to output vector of polynomials. - * Output will have coefficients normalized to [0,..,q-1]. - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU) - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_decompress_du(polyvec *r, - const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU)) - requires(memory_no_alias(r, sizeof(polyvec))) - assigns(object_whole(r)) - ensures(forall(k0, 0, MLKEM_K, - array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) -); - -#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes) -/************************************************* - * Name: polyvec_tobytes - * - * Description: Serialize vector of polynomials - * - * Arguments: - uint8_t *r: pointer to output byte array - * (needs space for MLKEM_POLYVECBYTES) - * - const polyvec *a: pointer to input vector of polynomials - * Each polynomial must have coefficients in [0,..,q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a) -__contract__( - requires(memory_no_alias(a, sizeof(polyvec))) - requires(memory_no_alias(r, MLKEM_POLYVECBYTES)) - requires(forall(k0, 0, MLKEM_K, - array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) - assigns(object_whole(r)) -); - -#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes) -/************************************************* - * Name: polyvec_frombytes - * - * Description: De-serialize vector of polynomials; - * inverse of polyvec_tobytes - * - * Arguments: - const polyvec *a: pointer to output vector of polynomials - * (of length MLKEM_POLYVECBYTES). Output will have coefficients - * normalized in [0..4095]. - * - uint8_t *r: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES]) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - requires(memory_no_alias(a, MLKEM_POLYVECBYTES)) - assigns(object_whole(r)) - ensures(forall(k0, 0, MLKEM_K, - array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) -); - -#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt) -/************************************************* - * Name: polyvec_ntt - * - * Description: Apply forward NTT to all elements of a vector of polynomials. - * - * The input is assumed to be in normal order and - * coefficient-wise bound by MLKEM_Q in absolute value. - * - * The output polynomial is in bitreversed order, and - * coefficient-wise bound by NTT_BOUND in absolute value. 
- * - * Arguments: - polyvec *r: pointer to in/output vector of polynomials - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_ntt(polyvec *r) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - requires(forall(j, 0, MLKEM_K, - array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q))) - assigns(object_whole(r)) - ensures(forall(j, 0, MLKEM_K, - array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND))) -); - -#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont) -/************************************************* - * Name: polyvec_invntt_tomont - * - * Description: Apply inverse NTT to all elements of a vector of polynomials - * and multiply by Montgomery factor 2^16 - * - * The input is assumed to be in bitreversed order, and can - * have arbitrary coefficients in int16_t. - * - * The output polynomial is in normal order, and - * coefficient-wise bound by INVNTT_BOUND in absolute value. - * - * - * Arguments: - polyvec *r: pointer to in/output vector of polynomials - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_invntt_tomont(polyvec *r) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - assigns(object_whole(r)) - ensures(forall(j, 0, MLKEM_K, - array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, INVNTT_BOUND))) -); - -#define polyvec_basemul_acc_montgomery \ - MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery) -/************************************************* - * Name: polyvec_basemul_acc_montgomery - * - * Description: Multiply elements of a and b in NTT domain, accumulate into r, - * and multiply by 2^-16. 
- * - * Arguments: - poly *r: pointer to output polynomial - * - const polyvec *a: pointer to first input vector of polynomials - * - const polyvec *b: pointer to second input vector of polynomials - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(a, sizeof(polyvec))) - requires(memory_no_alias(b, sizeof(polyvec))) - requires(forall(k1, 0, MLKEM_K, - array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) - assigns(memory_slice(r, sizeof(poly))) -); - - -#define polyvec_basemul_acc_montgomery_cached \ - MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached) -/************************************************* - * Name: polyvec_basemul_acc_montgomery_cached - * - * Description: Scalar product of two vectors of polynomials in NTT domain, - * using mulcache for second operand. - * - * Bounds: - * - Every coefficient of a is assumed to be in [0..4095] - * - No bounds guarantees for the coefficients in the result. - * - * Arguments: - poly *r: pointer to output polynomial - * - const polyvec *a: pointer to first input polynomial vector - * - const polyvec *b: pointer to second input polynomial vector - * - const polyvec_mulcache *b_cache: pointer to mulcache - * for second input polynomial vector. Can be computed - * via polyvec_mulcache_compute(). 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, - const polyvec *b, - const polyvec_mulcache *b_cache) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(a, sizeof(polyvec))) - requires(memory_no_alias(b, sizeof(polyvec))) - requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache))) - requires(forall(k1, 0, MLKEM_K, - array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) - assigns(memory_slice(r, sizeof(poly))) -); - -#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute) -/************************************************************ - * Name: polyvec_mulcache_compute - * - * Description: Computes the mulcache for a vector of polynomials in NTT domain - * - * The mulcache of a degree-2 polynomial b := b0 + b1*X - * in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when - * computing products of b in Fq[X]/(X^2-zeta). - * - * The mulcache of a polynomial in NTT domain -- which is - * a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta), - * for varying zeta, is the 128-tuple of mulcaches of those - * polynomials. - * - * The mulcache of a vector of polynomials is the vector - * of mulcaches of its entries. - * - * Arguments: - x: Pointer to mulcache to be populated - * - a: Pointer to input polynomial vector - ************************************************************/ -/* - * NOTE: The default C implementation of this function populates - * the mulcache with values in (-q,q), but this is not needed for the - * higher level safety proofs, and thus not part of the spec. 
- */ -MLKEM_NATIVE_INTERNAL_API -void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a) -__contract__( - requires(memory_no_alias(x, sizeof(polyvec_mulcache))) - requires(memory_no_alias(a, sizeof(polyvec))) - assigns(object_whole(x)) -); - -#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce) -/************************************************* - * Name: polyvec_reduce - * - * Description: Applies Barrett reduction to each coefficient - * of each element of a vector of polynomials; - * for details of the Barrett reduction see comments in reduce.c - * - * Arguments: - polyvec *r: pointer to input/output polynomial - **************************************************/ -/* - * NOTE: The semantics of polyvec_reduce() is different in - * the reference implementation, which requires - * signed canonical output data. Unsigned canonical - * outputs are better suited to the only remaining - * use of poly_reduce() in the context of (de)serialization. - */ -MLKEM_NATIVE_INTERNAL_API -void polyvec_reduce(polyvec *r) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - assigns(object_whole(r)) - ensures(forall(k0, 0, MLKEM_K, - array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) -); - -#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add) -/************************************************* - * Name: polyvec_add - * - * Description: Add vectors of polynomials - * - * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be - * added to - * - const polyvec *b: pointer to second input vector of polynomials - * - * The coefficients of r and b must be so that the addition does - * not overflow. Otherwise, the behaviour of this function is undefined. - * - * The coefficients returned in *r are in int16_t which is sufficient - * to prove type-safety of calling units. Therefore, no stronger - * ensures clause is required on this function. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_add(polyvec *r, const polyvec *b) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - requires(memory_no_alias(b, sizeof(polyvec))) - requires(forall(j0, 0, MLKEM_K, - forall(k0, 0, MLKEM_N, - (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX))) - requires(forall(j1, 0, MLKEM_K, - forall(k1, 0, MLKEM_N, - (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN))) - assigns(object_whole(r)) -); - -#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont) -/************************************************* - * Name: polyvec_tomont - * - * Description: Inplace conversion of all coefficients of a polynomial - * vector from normal domain to Montgomery domain - * - * Bounds: Output < q in absolute value. - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_tomont(polyvec *r) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - assigns(memory_slice(r, sizeof(polyvec))) - assigns(object_whole(r)) - ensures(forall(j, 0, MLKEM_K, - array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q))) -); - -#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x) -/************************************************* - * Name: poly_getnoise_eta1_4x - * - * Description: Batch sample four polynomials deterministically from a seed - * and nonces, with output polynomials close to centered binomial distribution - * with parameter MLKEM_ETA1. 
- * - * Arguments: - poly *r{0,1,2,3}: pointer to output polynomial - * - const uint8_t *seed: pointer to input seed - * (of length MLKEM_SYMBYTES bytes) - * - uint8_t nonce{0,1,2,3}: one-byte input nonce - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0, - uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) -/* Depending on MLKEM_K, the pointers passed to this function belong - to the same objects, so we cannot use memory_no_alias for r0-r3. - - NOTE: Somehow it is important to use memory_no_alias() first in the - conjunctions defining each case. -*/ -#if MLKEM_K == 2 -__contract__( - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */ - (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) && - r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2))) - assigns(memory_slice(r0, sizeof(poly))) - assigns(memory_slice(r1, sizeof(poly))) - assigns(memory_slice(r2, sizeof(poly))) - assigns(memory_slice(r3, sizeof(poly))) - ensures( - array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); -); -#elif MLKEM_K == 4 -__contract__( - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - requires( /* Case B: r0, r1, r2, r3 consecutive */ - (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3)) - assigns(memory_slice(r0, sizeof(poly))) - assigns(memory_slice(r1, sizeof(poly))) - assigns(memory_slice(r2, sizeof(poly))) - assigns(memory_slice(r3, sizeof(poly))) - ensures( - array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 
1) - && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); -); -#elif MLKEM_K == 3 -__contract__( - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - requires( /* Case C: r0, r1, r2 consecutive */ - (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) && - r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0))) - assigns(memory_slice(r0, sizeof(poly))) - assigns(memory_slice(r1, sizeof(poly))) - assigns(memory_slice(r2, sizeof(poly))) - assigns(memory_slice(r3, sizeof(poly))) - ensures( - array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); -); -#endif /* MLKEM_K */ - -#if MLKEM_ETA1 == MLKEM_ETA2 -/* - * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024 - * where MLKEM_ETA2 = MLKEM_ETA1 = 2. - * For ml-kem-512, poly_getnoise_eta1122_4x is used instead. - */ -#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x -#endif /* MLKEM_ETA1 == MLKEM_ETA2 */ - -#if MLKEM_K == 2 || MLKEM_K == 4 -#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2) -/************************************************* - * Name: poly_getnoise_eta2 - * - * Description: Sample a polynomial deterministically from a seed and a nonce, - * with output polynomial close to centered binomial distribution - * with parameter MLKEM_ETA2 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *seed: pointer to input seed - * (of length MLKEM_SYMBYTES bytes) - * - uint8_t nonce: one-byte input nonce - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES], - uint8_t nonce) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - assigns(object_whole(r)) - ensures(array_abs_bound(r->coeffs, 0, 
MLKEM_N, MLKEM_ETA2 + 1)) -); -#endif /* MLKEM_K == 2 || MLKEM_K == 4 */ - -#if MLKEM_K == 2 -#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x) -/************************************************* - * Name: poly_getnoise_eta1122_4x - * - * Description: Batch sample four polynomials deterministically from a seed - * and a nonces, with output polynomials close to centered binomial - * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2 - * - * Arguments: - poly *r{0,1,2,3}: pointer to output polynomial - * - const uint8_t *seed: pointer to input seed - * (of length MLKEM_SYMBYTES bytes) - * - uint8_t nonce{0,1,2,3}: one-byte input nonce - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t seed[MLKEM_SYMBYTES], - uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, - uint8_t nonce3) -__contract__( - requires( /* r0, r1 consecutive, r2, r3 consecutive */ - (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) && - r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2))) - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3)) - ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1) - && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)); -); -#endif /* MLKEM_K == 2 */ - -#endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/reduce.h deleted file mode 100644 index b432a4201..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/reduce.h +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef REDUCE_H -#define REDUCE_H - -#include -#include "cbmc.h" 
-#include "common.h" -#include "debug.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16) -#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic) -#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce) -#define fqmul MLKEM_NAMESPACE(fqmul) -#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce) -/* End of static namespacing */ - -#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */ - -/************************************************* - * Name: cast_uint16_to_int16 - * - * Description: Cast uint16 value to int16 - * - * Returns: - * input x in 0 .. 32767: returns value unchanged - * input x in 32768 .. 65535: returns (x - 65536) - **************************************************/ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "conversion" -#endif -ALWAYS_INLINE -static INLINE int16_t cast_uint16_to_int16(uint16_t x) -{ - /* - * PORTABILITY: This relies on uint16_t -> int16_t - * being implemented as the inverse of int16_t -> uint16_t, - * which is implementation-defined (C99 6.3.1.3 (3)) - * CBMC (correctly) fails to prove this conversion is OK, - * so we have to suppress that check here - */ - return (int16_t)x; -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************* - * Name: montgomery_reduce_generic - * - * Description: Generic Montgomery reduction; given a 32-bit integer a, computes - * 16-bit integer congruent to a * R^-1 mod q, where R=2^16 - * - * Arguments: - int32_t a: input integer to be reduced - * - * Returns: integer congruent to a * R^-1 modulo q, with absolute value - * <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2 - * - **************************************************/ -ALWAYS_INLINE -static INLINE int16_t montgomery_reduce_generic(int32_t a) -{ - /* QINV == -3327 
converted to uint16_t == -3327 + 65536 == 62209 */ - const uint32_t QINV = 62209; /* q^-1 mod 2^16 */ - - /* Compute a*q^{-1} mod 2^16 in unsigned representatives */ - const uint16_t a_reduced = a & UINT16_MAX; - const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX; - - /* Lift to signed canonical representative mod 2^16. */ - const int16_t t = cast_uint16_to_int16(a_inverted); - - int32_t r = a - ((int32_t)t * MLKEM_Q); - /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */ - - /* - * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, - * implementation-defined for negative left argument. Here, - * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) - */ - r = r >> 16; - /* Bounds: |r >> 16| <= ceil(|r| / 2^16) - * <= ceil(|a| / 2^16 + MLKEM_Q / 2) - * <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 - * - * (Note that |a >> n| = ceil(|a| / 2^16) for negative a) - */ - - return (int16_t)r; -} - -/************************************************* - * Name: montgomery_reduce - * - * Description: Montgomery reduction - * - * Arguments: - int32_t a: input integer to be reduced - * Must be smaller than 2 * 2^12 * 2^15 in absolute value. - * - * Returns: integer congruent to a * R^-1 modulo q, - * smaller than 2 * q in absolute value. 
- **************************************************/ -static INLINE int16_t montgomery_reduce(int32_t a) -__contract__( - requires(a > -(2 * UINT12_LIMIT * 32768)) - requires(a < (2 * UINT12_LIMIT * 32768)) - ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q) -) -{ - int16_t res; - debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768); - - res = montgomery_reduce_generic(a); - /* Bounds: - * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2 - * <= UINT12_LIMIT + (MLKEM_Q + 1) / 2 - * < 2 * MLKEM_Q */ - - debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q); - return res; -} - -/************************************************* - * Name: fqmul - * - * Description: Montgomery multiplication modulo q=3329 - * - * Arguments: - int16_t a: first factor - * Can be any int16_t. - * - int16_t b: second factor. - * Must be signed canonical (abs value <(q+1)/2) - * - * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and - * smaller than q in absolute value. 
- * - **************************************************/ -static INLINE int16_t fqmul(int16_t a, int16_t b) -__contract__( - requires(b > -HALF_Q) - requires(b < HALF_Q) - ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q) -) -{ - int16_t res; - debug_assert_abs_bound(&b, 1, HALF_Q); - - res = montgomery_reduce((int32_t)a * (int32_t)b); - /* Bounds: - * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2 - * < MLKEM_Q - */ - - debug_assert_abs_bound(&res, 1, MLKEM_Q); - return res; -} - -/************************************************* - * Name: barrett_reduce - * - * Description: Barrett reduction; given a 16-bit integer a, computes - * centered representative congruent to a mod q in - * {-(q-1)/2,...,(q-1)/2} - * - * Arguments: - int16_t a: input integer to be reduced - * - * Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. - **************************************************/ -static INLINE int16_t barrett_reduce(int16_t a) -__contract__( - ensures(return_value > -HALF_Q && return_value < HALF_Q) -) -{ - /* - * To divide by MLKEM_Q using Barrett multiplication, the "magic number" - * multiplier is round_to_nearest(2**26/MLKEM_Q) - */ - const int BPOWER = 26; - const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q; - - /* - * Compute round_to_nearest(a/MLKEM_Q) using the multiplier - * above and shift by BPOWER places. - * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, - * implementation-defined for negative left argument. Here, - * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) - */ - const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER; - - /* - * t is in -10 .. 
+10, so we need 32-bit math to - * evaluate t * MLKEM_Q and the subsequent subtraction - */ - int16_t res = (int16_t)(a - t * MLKEM_Q); - - debug_assert_abs_bound(&res, 1, HALF_Q); - return res; -} - -#endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.c deleted file mode 100644 index cbbe4407f..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.c +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) - -#include "arith_backend.h" -#include "debug.h" -#include "fips202.h" -#include "fips202x4.h" -#include "rej_uniform.h" -#include "symmetric.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define rej_uniform MLKEM_NAMESPACE(rej_uniform) -#define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar) -/* End of static namespacing */ - -static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target, - unsigned int offset, const uint8_t *buf, - unsigned int buflen) -__contract__( - requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0) - requires(memory_no_alias(r, sizeof(int16_t) * target)) - requires(memory_no_alias(buf, buflen)) - requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * target)) - ensures(offset <= return_value && return_value <= target) - ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q)) -) -{ - unsigned int ctr, pos; - uint16_t val0, val1; - - debug_assert_bound(r, offset, 0, MLKEM_Q); - - ctr = offset; - pos = 0; - /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */ - while (ctr < target && pos + 3 <= buflen) - __loop__( - invariant(offset 
<= ctr && ctr <= target && pos <= buflen) - invariant(ctr > 0 ==> array_bound(r, 0, ctr, 0, MLKEM_Q))) - { - val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; - val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; - pos += 3; - - if (val0 < MLKEM_Q) - { - r[ctr++] = val0; - } - if (ctr < target && val1 < MLKEM_Q) - { - r[ctr++] = val1; - } - } - - debug_assert_bound(r, ctr, 0, MLKEM_Q); - return ctr; -} - -#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM) -/************************************************* - * Name: rej_uniform - * - * Description: Run rejection sampling on uniform random bytes to generate - * uniform random integers mod q - * - * Arguments: - int16_t *r: pointer to output buffer - * - unsigned int target: requested number of 16-bit integers - * (uniform mod q). - * Must be <= 4096. - * - unsigned int offset: number of 16-bit integers that have - * already been sampled. - * Must be <= target. - * - const uint8_t *buf: pointer to input buffer - * (assumed to be uniform random bytes) - * - unsigned int buflen: length of input buffer in bytes - * Must be <= 4096. - * Must be a multiple of 3. - * - * Note: Strictly speaking, only a few values of buflen near UINT_MAX need - * excluding. The limit of 4096 is somewhat arbitary but sufficient for all - * uses of this function. Similarly, the actual limit for target is UINT_MAX/2. - * - * Returns the new offset of sampled 16-bit integers, at most target, - * and at least the initial offset. - * If the new offset is strictly less than len, all of the input buffers - * is guaranteed to have been consumed. If it is equal to len, no information - * is provided on how many bytes of the input buffer have been consumed. - **************************************************/ - -/* - * NOTE: The signature differs from the Kyber reference implementation - * in that it adds the offset and always expects the base of the target - * buffer. 
This avoids shifting the buffer base in the caller, which appears - * tricky to reason about. - */ -static unsigned int rej_uniform(int16_t *r, unsigned int target, - unsigned int offset, const uint8_t *buf, - unsigned int buflen) -__contract__( - requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0) - requires(memory_no_alias(r, sizeof(int16_t) * target)) - requires(memory_no_alias(buf, buflen)) - requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * target)) - ensures(offset <= return_value && return_value <= target) - ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q)) -) -{ - return rej_uniform_scalar(r, target, offset, buf, buflen); -} -#else /* MLKEM_USE_NATIVE_REJ_UNIFORM */ -static unsigned int rej_uniform(int16_t *r, unsigned int target, - unsigned int offset, const uint8_t *buf, - unsigned int buflen) -{ - int ret; - - /* Sample from large buffer with full lane as much as possible. 
*/ - ret = rej_uniform_native(r + offset, target - offset, buf, buflen); - if (ret != -1) - { - unsigned res = offset + (unsigned)ret; - debug_assert_bound(r, res, 0, MLKEM_Q); - return res; - } - - return rej_uniform_scalar(r, target, offset, buf, buflen); -} -#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */ - -#ifndef MLKEM_GEN_MATRIX_NBLOCKS -#define MLKEM_GEN_MATRIX_NBLOCKS \ - ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE) -#endif - -MLKEM_NATIVE_INTERNAL_API -void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4]) -{ - /* Temporary buffers for XOF output before rejection sampling */ - uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - - /* Tracks the number of coefficients we have already sampled */ - unsigned int ctr[KECCAK_WAY]; - xof_x4_ctx statex; - unsigned int buflen; - - shake128x4_inc_init(&statex); - - /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */ - xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3], - MLKEM_SYMBYTES + 2); - - /* - * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS. - * This should generate the matrix entries with high probability. - */ - xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS, - &statex); - buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE; - ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen); - ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen); - ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen); - ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen); - - /* - * So long as not all matrix entries have been generated, squeeze - * one more block a time until we're done. 
- */ - buflen = XOF_RATE; - while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N || - ctr[3] < MLKEM_N) - __loop__( - assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0), - object_whole(buf1), object_whole(buf2), object_whole(buf3)) - invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N) - invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N) - invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q)) - invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q)) - invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q)) - invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q))) - { - xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex); - ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen); - ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen); - ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen); - ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen); - } - - xof_x4_release(&statex); -} - -MLKEM_NATIVE_INTERNAL_API -void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) -{ - xof_ctx state; - uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - unsigned int ctr, buflen; - - shake128_inc_init(&state); - - xof_absorb(&state, seed, MLKEM_SYMBYTES + 2); - - /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS. - */ - /* This should generate the matrix entry with high probability. 
*/ - xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state); - buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE; - ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen); - - /* Squeeze + sample one more block a time until we're done */ - buflen = XOF_RATE; - while (ctr < MLKEM_N) - __loop__( - assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf)) - invariant(ctr <= MLKEM_N) - invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q))) - { - xof_squeezeblocks(buf, 1, &state); - ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen); - } - - xof_release(&state); -} - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform) -int empty_cu_rej_uniform; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.h deleted file mode 100644 index 801287259..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef REJ_UNIFORM_H -#define REJ_UNIFORM_H - -#include -#include -#include "cbmc.h" -#include "common.h" -#include "poly.h" - -#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4) -/************************************************* - * Name: poly_rej_uniform_x4 - * - * Description: Generate four polynomials using rejection sampling - * on (pseudo-)uniformly random bytes sampled from a seed. - * - * Arguments: - poly *vec: Pointer to an array of 4 polynomials - * to be sampled. - * - uint8_t *seed[4]: Pointer to array of four pointers - * pointing to the seed buffers of size - * MLKEM_SYMBYTES + 2 each. 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4]) -__contract__( - requires(memory_no_alias(vec, sizeof(poly) * 4)) - requires(memory_no_alias(seed, sizeof(uint8_t*) * 4)) - requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2)) - requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2)) - requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2)) - requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2)) - assigns(memory_slice(vec, sizeof(poly) * 4)) - ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q))); - -#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform) -/************************************************* - * Name: poly_rej_uniform - * - * Description: Generate polynomial using rejection sampling - * on (pseudo-)uniformly random bytes sampled from a seed. - * - * Arguments: - poly *vec: Pointer to polynomial to be sampled. - * - uint8_t *seed: Pointer to seed buffer of size - * MLKEM_SYMBYTES + 2 each. 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) -__contract__( - requires(memory_no_alias(entry, sizeof(poly))) - requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2)) - assigns(memory_slice(entry, sizeof(poly))) - ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q))); - -#endif /* REJ_UNIFORM_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/sampling.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/sampling.c new file mode 100644 index 000000000..98cbdcb74 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/sampling.c @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include "common.h" +#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) + +#include "arith_backend.h" +#include "debug.h" +#include "fips202.h" +#include "fips202x4.h" +#include "sampling.h" +#include "symmetric.h" + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define rej_uniform MLKEM_NAMESPACE(rej_uniform) +#define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar) +#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) +#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) +/* End of static namespacing */ + +static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target, + unsigned int offset, const uint8_t *buf, + unsigned int buflen) +__contract__( + requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0) + requires(memory_no_alias(r, sizeof(int16_t) * target)) + requires(memory_no_alias(buf, buflen)) + requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * target)) + ensures(offset <= return_value && return_value <= target) + ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q)) +) +{ + unsigned int ctr, pos; + uint16_t val0, val1; + + debug_assert_bound(r, offset, 0, MLKEM_Q); + + ctr = offset; + pos = 0; + /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */ + while (ctr < target && pos + 3 <= buflen) + __loop__( + invariant(offset <= ctr && ctr <= target && pos <= buflen) + invariant(ctr > 0 ==> array_bound(r, 0, ctr, 0, MLKEM_Q))) + { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; + pos += 3; + + if (val0 < MLKEM_Q) + { + r[ctr++] = val0; + } + if (ctr < target && val1 < MLKEM_Q) + { + r[ctr++] = val1; + } + } + + debug_assert_bound(r, ctr, 0, MLKEM_Q); + return ctr; +} + +#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM) +/************************************************* + * Name: rej_uniform + * + * Description: Run rejection sampling on uniform random bytes to generate + * uniform random integers mod q + * + * Arguments: - int16_t *r: pointer to output buffer + * - unsigned int target: requested number of 16-bit integers + * (uniform mod q). + * Must be <= 4096. 
+ * - unsigned int offset: number of 16-bit integers that have + * already been sampled. + * Must be <= target. + * - const uint8_t *buf: pointer to input buffer + * (assumed to be uniform random bytes) + * - unsigned int buflen: length of input buffer in bytes + * Must be <= 4096. + * Must be a multiple of 3. + * + * Note: Strictly speaking, only a few values of buflen near UINT_MAX need + * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all + * uses of this function. Similarly, the actual limit for target is UINT_MAX/2. + * + * Returns the new offset of sampled 16-bit integers, at most target, + * and at least the initial offset. + * If the new offset is strictly less than target, all of the input buffer + * is guaranteed to have been consumed. If it is equal to target, no information + * is provided on how many bytes of the input buffer have been consumed. + **************************************************/ + +/* + * NOTE: The signature differs from the Kyber reference implementation + * in that it adds the offset and always expects the base of the target + * buffer. This avoids shifting the buffer base in the caller, which appears + * tricky to reason about. 
+ */ +static unsigned int rej_uniform(int16_t *r, unsigned int target, + unsigned int offset, const uint8_t *buf, + unsigned int buflen) +__contract__( + requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0) + requires(memory_no_alias(r, sizeof(int16_t) * target)) + requires(memory_no_alias(buf, buflen)) + requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * target)) + ensures(offset <= return_value && return_value <= target) + ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q)) +) +{ + return rej_uniform_scalar(r, target, offset, buf, buflen); +} +#else /* MLKEM_USE_NATIVE_REJ_UNIFORM */ +static unsigned int rej_uniform(int16_t *r, unsigned int target, + unsigned int offset, const uint8_t *buf, + unsigned int buflen) +{ + int ret; + + /* Sample from large buffer with full lane as much as possible. */ + ret = rej_uniform_native(r + offset, target - offset, buf, buflen); + if (ret != -1) + { + unsigned res = offset + (unsigned)ret; + debug_assert_bound(r, res, 0, MLKEM_Q); + return res; + } + + return rej_uniform_scalar(r, target, offset, buf, buflen); +} +#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */ + +#ifndef MLKEM_GEN_MATRIX_NBLOCKS +#define MLKEM_GEN_MATRIX_NBLOCKS \ + ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE) +#endif + +MLKEM_NATIVE_INTERNAL_API +void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4]) +{ + /* Temporary buffers for XOF output before rejection sampling */ + uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + + /* Tracks the number of coefficients we have already sampled */ + unsigned int ctr[KECCAK_WAY]; + xof_x4_ctx statex; + unsigned int buflen; + + shake128x4_inc_init(&statex); + + /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */ + 
xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3], + MLKEM_SYMBYTES + 2); + + /* + * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS. + * This should generate the matrix entries with high probability. + */ + xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS, + &statex); + buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE; + ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen); + ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen); + ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen); + ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen); + + /* + * So long as not all matrix entries have been generated, squeeze + * one more block a time until we're done. + */ + buflen = XOF_RATE; + while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N || + ctr[3] < MLKEM_N) + __loop__( + assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0), + object_whole(buf1), object_whole(buf2), object_whole(buf3)) + invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N) + invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N) + invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q)) + invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q)) + invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q)) + invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q))) + { + xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex); + ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen); + ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen); + ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen); + ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen); + } + + xof_x4_release(&statex); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) +{ + xof_ctx state; + uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + unsigned int ctr, 
buflen; + + shake128_inc_init(&state); + + xof_absorb(&state, seed, MLKEM_SYMBYTES + 2); + + /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS. + */ + /* This should generate the matrix entry with high probability. */ + xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state); + buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE; + ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen); + + /* Squeeze + sample one more block a time until we're done */ + buflen = XOF_RATE; + while (ctr < MLKEM_N) + __loop__( + assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf)) + invariant(ctr <= MLKEM_N) + invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q))) + { + xof_squeezeblocks(buf, 1, &state); + ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen); + } + + xof_release(&state); +} + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) +#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) +/* End of static namespacing */ + +/************************************************* + * Name: load32_littleendian + * + * Description: load 4 bytes into a 32-bit integer + * in little-endian order + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns 32-bit unsigned integer loaded from x + **************************************************/ +static uint32_t load32_littleendian(const uint8_t x[4]) +{ + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + r |= (uint32_t)x[3] << 24; + return r; +} + +MLKEM_NATIVE_INTERNAL_API +void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3))) + { + unsigned j; + uint32_t t = load32_littleendian(buf + 4 * i); + uint32_t d = t & 0x55555555; + d += (t >> 1) & 0x55555555; + + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3))) + { + const int16_t a = (d >> (4 * j + 0)) & 0x3; + const int16_t b = (d >> (4 * j + 2)) & 0x3; + r->coeffs[8 * i + j] = a - b; + } + } +} + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 +/************************************************* + * Name: load24_littleendian + * + * Description: load 3 bytes into a 32-bit integer + * in little-endian order. 
+ * This function is only needed for ML-KEM-512 + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) + **************************************************/ +static uint32_t load24_littleendian(const uint8_t x[3]) +{ + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + return r; +} + +MLKEM_NATIVE_INTERNAL_API +void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 4; i++) + __loop__( + invariant(i <= MLKEM_N / 4) + invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4))) + { + unsigned j; + const uint32_t t = load24_littleendian(buf + 3 * i); + uint32_t d = t & 0x00249249; + d += (t >> 1) & 0x00249249; + d += (t >> 2) & 0x00249249; + + for (j = 0; j < 4; j++) + __loop__( + invariant(i <= MLKEM_N / 4 && j <= 4) + invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4))) + { + const int16_t a = (d >> (6 * j + 0)) & 0x7; + const int16_t b = (d >> (6 * j + 3)) & 0x7; + r->coeffs[4 * i + j] = a - b; + } + } +} +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \ + 3 */ + +#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ + +#define empty_cu_sampling MLKEM_NAMESPACE_K(empty_cu_sampling) +int empty_cu_sampling; + +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/sampling.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/sampling.h new file mode 100644 index 000000000..cc524e0fc --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/sampling.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef SAMPLING_H +#define SAMPLING_H + +#include +#include +#include "cbmc.h" +#include "common.h" +#include "poly.h" + +#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2) +/************************************************* + * Name: poly_cbd2 
+ * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter eta=2 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]); + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 +#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3) +/************************************************* + * Name: poly_cbd3 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter eta=3. + * This function is only needed for ML-KEM-512 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]); +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */ + +#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4) +/************************************************* + * Name: poly_rej_uniform_x4 + * + * Description: Generate four polynomials using rejection sampling + * on (pseudo-)uniformly random bytes sampled from a seed. + * + * Arguments: - poly *vec: Pointer to an array of 4 polynomials + * to be sampled. + * - uint8_t *seed[4]: Pointer to array of four pointers + * pointing to the seed buffers of size + * MLKEM_SYMBYTES + 2 each. 
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4]) +__contract__( + requires(memory_no_alias(vec, sizeof(poly) * 4)) + requires(memory_no_alias(seed, sizeof(uint8_t*) * 4)) + requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2)) + requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2)) + requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2)) + requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2)) + assigns(memory_slice(vec, sizeof(poly) * 4)) + ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q))); + +#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform) +/************************************************* + * Name: poly_rej_uniform + * + * Description: Generate polynomial using rejection sampling + * on (pseudo-)uniformly random bytes sampled from a seed. + * + * Arguments: - poly *vec: Pointer to polynomial to be sampled. + * - uint8_t *seed: Pointer to seed buffer of size + * MLKEM_SYMBYTES + 2 each. 
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) +__contract__( + requires(memory_no_alias(entry, sizeof(poly))) + requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2)) + assigns(memory_slice(entry, sizeof(poly))) + ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q))); + +#endif /* SAMPLING_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/zetas.c index 4ef887c62..987f0dce4 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/zetas.c @@ -10,7 +10,7 @@ #include "common.h" #if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) -#include "ntt.h" +#include "poly.h" /* * Table of zeta values used in the reference NTT and inverse NTT. diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/api.h deleted file mode 100644 index 792ecb8a4..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/api.h +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * Native arithmetic interface - * - * This header is primarily for documentation purposes. - * It should not be included by backend implementations. - * - * To ensure consistency with backends, the header will be - * included automatically after inclusion of the active - * backend, to ensure consistency of function signatures, - * and run sanity checks. - */ -#ifdef MLKEM_NATIVE_ARITH_NATIVE_API_H -#error \ - "The arithmetic backend API `mlkem/native/api.h` " \ - "should not be directly included. Please include the relevant " \ - "structure headers directly." 
-#else /* MLKEM_NATIVE_ARITH_NATIVE_API_H */ -#define MLKEM_NATIVE_ARITH_NATIVE_API_H - -#include -#include "poly.h" -#include "polyvec.h" - -/* - * This is the C<->native interface allowing for the drop-in of - * native code for performance critical arithmetic components of ML-KEM. - * - * A _backend_ is a specific implementation of (part of) this interface. - * - * To add a function to a backend, define MLKEM_USE_NATIVE_XXX and - * implement `static inline xxx(...)` in the profile header. - * - * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can - * be set if there are native implementations for all of NTT, invNTT, and - * base multiplication, and allows the native implementation to use a - * custom order of polynomial coefficients in NTT domain -- the use of such - * custom order is not an implementation-detail since the public matrix - * is generated in NTT domain. In this case, a permutation function - * poly_permute_bitrev_to_custom() needs to be provided that permutes - * polynomials in NTT domain from bitreversed to the custom order. - */ - -/* - * Those functions are meant to be trivial wrappers around the chosen native - * implementation. The are static inline to avoid unnecessary calls. - * The macro before each declaration controls whether a native - * implementation is present. - */ - -#if defined(MLKEM_USE_NATIVE_NTT) -/************************************************* - * Name: ntt_native - * - * Description: Computes negacyclic number-theoretic transform (NTT) of - * a polynomial in place. - * - * The input polynomial is assumed to be in normal order. - * The output polynomial is in bitreversed order, or of a - * custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. - * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER - * for more information. 
- * - * Arguments: - poly *p: pointer to in/output polynomial - **************************************************/ -static INLINE void ntt_native(poly *); -#endif /* MLKEM_USE_NATIVE_NTT */ - -#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER) -/* - * This must only be set if NTT, invNTT, basemul, mulcache, and - * to/from byte stream conversions all have native implementations - * that are adapted to the custom order. - */ -#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \ - !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ - !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) || \ - !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) || \ - !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) -#error \ - "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \ -set if there are native implementations for NTT, invNTT, mulcache, basemul, \ -and to/from bytes conversions." -#endif - -/************************************************* - * Name: poly_permute_bitrev_to_custom - * - * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined, - * convert a polynomial in NTT domain from bitreversed - * order to the custom order output by the native NTT. - * - * This must only be defined if there is native code for - * all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache. - * Arguments: - poly *p: pointer to in/output polynomial - * - **************************************************/ -static INLINE void poly_permute_bitrev_to_custom(poly *); -#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ - -#if defined(MLKEM_USE_NATIVE_INTT) -/************************************************* - * Name: intt_native - * - * Description: Computes inverse of negacyclic number-theoretic transform (NTT) - * of a polynomial in place. - * - * The input polynomial is in bitreversed order, or of a - * custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. - * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER - * for more information. 
- * The output polynomial is assumed to be in normal order. - * - * Arguments: - uint16_t *a: pointer to in/output polynomial - **************************************************/ -static INLINE void intt_native(poly *); -#endif /* MLKEM_USE_NATIVE_INTT */ - -#if defined(MLKEM_USE_NATIVE_POLY_REDUCE) -/************************************************* - * Name: poly_reduce_native - * - * Description: Applies modular reduction to all coefficients of a polynomial. - * - * Arguments: - poly *r: pointer to input/output polynomial - **************************************************/ -static INLINE void poly_reduce_native(poly *); -#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ - -#if defined(MLKEM_USE_NATIVE_POLY_TOMONT) -/************************************************* - * Name: poly_tomont_native - * - * Description: Inplace conversion of all coefficients of a polynomial - * from normal domain to Montgomery domain - * - * Arguments: - poly *r: pointer to input/output polynomial - **************************************************/ -static INLINE void poly_tomont_native(poly *); -#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ - -#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) -/************************************************* - * Name: poly_mulcache_compute_native - * - * Description: Compute multiplication cache for a polynomial - * in NTT domain. - * - * The purpose of the multiplication cache is to - * cache repeated computations required during a - * base multiplication of polynomials in NTT domain. - * The structure of the multiplication-cache is - * implementation defined. - * - * Arguments: INPUT: - * - poly: const pointer to input polynomial. - * This must be in NTT domain and inin bitreversed order, or of - * a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. - * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER - * for more information. 
- * OUTPUT - * - cache: pointer to multiplication cache - **************************************************/ -static INLINE void poly_mulcache_compute_native(poly_mulcache *cache, - const poly *poly); -#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ - -#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) -/************************************************* - * Name: poly_mulcache_compute_native - * - * Description: Compute multiplication of polynomials in NTT domain. - * - * Arguments: INPUT: - * - a: First polynomial operand. - * This must be in NTT domain and inin bitreversed order, or of - * a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. - * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER - * for more information. - * - b: Second polynomial operand. - * As for a. - * - b_cache: Multiplication-cache for b. - * OUTPUT - * - r: Result of the base multiplication. This is again - * in NTT domain, and of the same order as a and b. - **************************************************/ -static INLINE void polyvec_basemul_acc_montgomery_cached_native( - poly *r, const polyvec *a, const polyvec *b, - const polyvec_mulcache *b_cache); -#endif - -#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES) -/************************************************* - * Name: poly_tobytes_native - * - * Description: Serialization of a polynomial. - * Signed coefficients are converted to - * unsigned form before serialization. - * - * Arguments: INPUT: - * - a: const pointer to input polynomial, - * with each coefficient in the range -Q+1 .. 
Q-1 - * OUTPUT - * - r: pointer to output byte array - * (of MLKEM_POLYBYTES bytes) - **************************************************/ -static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], - const poly *a); -#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ - -#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) -/************************************************* - * Name: poly_frombytes_native - * - * Description: Serialization of a polynomial. - * Signed coefficients are converted to - * unsigned form before serialization. - * - * Arguments: INPUT: - * - r: pointer to output polynomial in NTT domain - * OUTPUT - * - a: const pointer to input byte aray - * (of MLKEM_POLYBYTES bytes) - **************************************************/ -static INLINE void poly_frombytes_native(poly *a, - const uint8_t r[MLKEM_POLYBYTES]); -#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ - -#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM) -/************************************************* - * Name: rej_uniform_native - * - * Description: Run rejection sampling on uniform random bytes to generate - * uniform random integers mod q - * - * Arguments: - int16_t *r: pointer to output buffer - * - unsigned int len: requested number of 16-bit integers - * (uniform mod q). - * - const uint8_t *buf: pointer to input buffer - * (assumed to be uniform random bytes) - * - unsigned int buflen: length of input buffer in bytes. - * - * Return -1 if the native implementation does not support the input lengths. - * Otherwise, returns non-negative number of sampled 16-bit integers (at most - * len). 
- **************************************************/ -static INLINE int rej_uniform_native(int16_t *r, unsigned int len, - const uint8_t *buf, unsigned int buflen); -#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */ - -#endif /* MLKEM_NATIVE_ARITH_NATIVE_API_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/arith_backend.h index 0543b1bd1..ade31cda1 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/arith_backend.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/arith_backend.h @@ -17,7 +17,7 @@ * Keep this _after_ the inclusion of the backend; otherwise, * the sanity checks won't have an effect. */ #if defined(MLKEM_NATIVE_CHECK_APIS) -#include "api.h" +#include "native/api.h" #endif #endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbd.c deleted file mode 100644 index 1e6b7c5d1..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbd.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED - -#include -#include "cbd.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. 
*/ -#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) -#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) -/* End of static namespacing */ - -/************************************************* - * Name: load32_littleendian - * - * Description: load 4 bytes into a 32-bit integer - * in little-endian order - * - * Arguments: - const uint8_t *x: pointer to input byte array - * - * Returns 32-bit unsigned integer loaded from x - **************************************************/ -static uint32_t load32_littleendian(const uint8_t x[4]) -{ - uint32_t r; - r = (uint32_t)x[0]; - r |= (uint32_t)x[1] << 8; - r |= (uint32_t)x[2] << 16; - r |= (uint32_t)x[3] << 24; - return r; -} - -MLKEM_NATIVE_INTERNAL_API -void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3))) - { - unsigned j; - uint32_t t = load32_littleendian(buf + 4 * i); - uint32_t d = t & 0x55555555; - d += (t >> 1) & 0x55555555; - - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3))) - { - const int16_t a = (d >> (4 * j + 0)) & 0x3; - const int16_t b = (d >> (4 * j + 2)) & 0x3; - r->coeffs[8 * i + j] = a - b; - } - } -} - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 -/************************************************* - * Name: load24_littleendian - * - * Description: load 3 bytes into a 32-bit integer - * in little-endian order. 
- * This function is only needed for ML-KEM-512 - * - * Arguments: - const uint8_t *x: pointer to input byte array - * - * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) - **************************************************/ -static uint32_t load24_littleendian(const uint8_t x[3]) -{ - uint32_t r; - r = (uint32_t)x[0]; - r |= (uint32_t)x[1] << 8; - r |= (uint32_t)x[2] << 16; - return r; -} - -MLKEM_NATIVE_INTERNAL_API -void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 4; i++) - __loop__( - invariant(i <= MLKEM_N / 4) - invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4))) - { - unsigned j; - const uint32_t t = load24_littleendian(buf + 3 * i); - uint32_t d = t & 0x00249249; - d += (t >> 1) & 0x00249249; - d += (t >> 2) & 0x00249249; - - for (j = 0; j < 4; j++) - __loop__( - invariant(i <= MLKEM_N / 4 && j <= 4) - invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4))) - { - const int16_t a = (d >> (6 * j + 0)) & 0x7; - const int16_t b = (d >> (6 * j + 3)) & 0x7; - r->coeffs[4 * i + j] = a - b; - } - } -} -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \ - 3 */ - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd) -int empty_cu_cbd; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbd.h deleted file mode 100644 index 54c1f5b90..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbd.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef CBD_H -#define CBD_H - -#include -#include "common.h" -#include "poly.h" - -#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2) -/************************************************* - * Name: poly_cbd2 - * - * Description: Given an array of uniformly random 
bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter eta=2 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]); - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 -#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3) -/************************************************* - * Name: poly_cbd3 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter eta=3. - * This function is only needed for ML-KEM-512 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]); -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */ - -#endif /* CBD_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/common.h index 4f326333e..62ed53ab1 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/common.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/common.h @@ -15,12 +15,19 @@ #include "sys.h" /* Include backend metadata */ -#if defined(MLKEM_USE_NATIVE) -#if defined(MLKEM_NATIVE_ARITH_BACKEND) -#include MLKEM_NATIVE_ARITH_BACKEND +#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) +#if defined(MLKEM_NATIVE_ARITH_BACKEND_FILE) +#include MLKEM_NATIVE_ARITH_BACKEND_FILE +#else +#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_ARITH is set, but MLKEM_NATIVE_ARITH_BACKEND_FILE is not. 
+#endif #endif -#if defined(MLKEM_NATIVE_FIPS202_BACKEND) -#include MLKEM_NATIVE_FIPS202_BACKEND + +#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) +#if defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE) +#include MLKEM_NATIVE_FIPS202_BACKEND_FILE +#else +#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, but MLKEM_NATIVE_FIPS202_BACKEND_FILE is not. #endif #endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/compress.c new file mode 100644 index 000000000..a03fe0ac4 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/compress.c @@ -0,0 +1,395 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include "common.h" +#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) + +#include +#include +#include "arith_backend.h" +#include "cbmc.h" +#include "compress.h" +#include "debug.h" +#include "verify.h" + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3) +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + uint8_t t[8] = {0}; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_bound(t, 0, j, 0, 16))) + { + t[j] = scalar_compress_d4(a->coeffs[8 * i + j]); + } + + r[i * 4] = t[0] | (t[1] << 4); + r[i * 4 + 1] = t[2] | (t[3] << 4); + r[i * 4 + 2] = t[4] | (t[5] << 4); + r[i * 4 + 3] = t[6] | (t[7] << 4); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a) +{ + unsigned j; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + for (j = 0; j < MLKEM_N / 4; j++) + __loop__(invariant(j <= MLKEM_N / 4)) + { + unsigned k; + uint16_t t[4]; + for (k = 0; k < 4; k++) + __loop__( + 
invariant(k <= 4) + invariant(forall(r, 0, k, t[r] < (1u << 10)))) + { + t[k] = scalar_compress_d10(a->coeffs[4 * j + k]); + } + + /* + * Make all implicit truncation explicit. No data is being + * truncated for the LHS's since each t[i] is 10-bit in size. + */ + r[5 * j + 0] = (t[0] >> 0) & 0xFF; + r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF); + r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF); + r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF); + r[5 * j + 4] = (t[3] >> 2); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 2; i++) + __loop__( + invariant(i <= MLKEM_N / 2) + invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q))) + { + r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF); + r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF); + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d10(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]) +{ + unsigned j; + for (j = 0; j < MLKEM_N / 4; j++) + __loop__( + invariant(j <= MLKEM_N / 4) + invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q))) + { + unsigned k; + uint16_t t[4]; + uint8_t const *base = &a[5 * j]; + + t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); + t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6)); + t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4)); + t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2)); + + for (k = 0; k < 4; k++) + __loop__( + invariant(k <= 4) + invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q))) + { + r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ + || MLKEM_K == 3) */ + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 +MLKEM_NATIVE_INTERNAL_API +void 
poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + uint8_t t[8] = {0}; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_bound(t, 0, j, 0, 32))) + { + t[j] = scalar_compress_d5(a->coeffs[8 * i + j]); + } + + /* + * Explicitly truncate to avoid warning about + * implicit truncation in CBMC, and use array indexing into + * r rather than pointer-arithmetic to simplify verification + */ + r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5)); + r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); + r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4)); + r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); + r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3)); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a) +{ + unsigned j; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (j = 0; j < MLKEM_N / 8; j++) + __loop__(invariant(j <= MLKEM_N / 8)) + { + unsigned k; + uint16_t t[8]; + for (k = 0; k < 8; k++) + __loop__( + invariant(k <= 8) + invariant(forall(r, 0, k, t[r] < (1u << 11)))) + { + t[k] = scalar_compress_d11(a->coeffs[8 * j + k]); + } + + /* + * Make all implicit truncation explicit. No data is being + * truncated for the LHS's since each t[i] is 11-bit in size. 
+ */ + r[11 * j + 0] = (t[0] >> 0) & 0xFF; + r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF); + r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF); + r[11 * j + 3] = (t[2] >> 2) & 0xFF; + r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF); + r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF); + r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF); + r[11 * j + 7] = (t[5] >> 1) & 0xFF; + r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF); + r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF); + r[11 * j + 10] = (t[7] >> 3); + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) + { + unsigned j; + uint8_t t[8]; + const unsigned offset = i * 5; + /* + * Explicitly truncate to avoid warning about + * implicit truncation in CBMC and unwind loop for ease + * of proof. + */ + + /* + * Decompress 5 8-bit bytes (so 40 bits) into + * 8 5-bit values stored in t[] + */ + t[0] = 0x1F & (a[offset + 0] >> 0); + t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3)); + t[2] = 0x1F & (a[offset + 1] >> 2); + t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1)); + t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4)); + t[5] = 0x1F & (a[offset + 3] >> 1); + t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2)); + t[7] = 0x1F & (a[offset + 4] >> 3); + + /* and copy to the correct slice in r[] */ + for (j = 0; j < 8; j++) + __loop__( + invariant(j <= 8 && i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) + { + r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d11(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]) +{ + unsigned j; + for (j = 0; j < MLKEM_N / 8; j++) + __loop__( + invariant(j <= MLKEM_N 
/ 8) + invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q))) + { + unsigned k; + uint16_t t[8]; + uint8_t const *base = &a[11 * j]; + t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); + t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5)); + t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) | + ((uint16_t)base[4] << 10)); + t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7)); + t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4)); + t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) | + ((uint16_t)base[8] << 9)); + t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6)); + t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3)); + + for (k = 0; k < 8; k++) + __loop__( + invariant(k <= 8) + invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q))) + { + r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]); + } + } + + debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); +} +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */ + +#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 2; i++) + __loop__(invariant(i <= MLKEM_N / 2)) + { + const uint16_t t0 = a->coeffs[2 * i]; + const uint16_t t1 = a->coeffs[2 * i + 1]; + /* + * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of + * significant data, so these can be packed into 24 bits or exactly + * 3 bytes, as follows. + */ + + /* Least significant bits 0 - 7 of t0. */ + r[3 * i + 0] = t0 & 0xFF; + + /* + * Most significant bits 8 - 11 of t0 become the least significant + * nibble of the second byte. The least significant 4 bits + * of t1 become the upper nibble of the second byte. + */ + r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0); + + /* Bits 4 - 11 of t1 become the third byte. 
*/ + r[3 * i + 2] = t1 >> 4; + } +} +#else /* MLKEM_USE_NATIVE_POLY_TOBYTES */ +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +{ + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + poly_tobytes_native(r, a->coeffs); +} +#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + +#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 2; i++) + __loop__( + invariant(i <= MLKEM_N / 2) + invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT))) + { + const uint8_t t0 = a[3 * i + 0]; + const uint8_t t1 = a[3 * i + 1]; + const uint8_t t2 = a[3 * i + 2]; + r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF); + r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4); + } + + /* Note that the coefficients are not canonical */ + debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT); +} +#else /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +{ + poly_frombytes_native(r->coeffs, a); +} +#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ + +MLKEM_NATIVE_INTERNAL_API +void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +{ + unsigned i; +#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8) +#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!" 
+#endif + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) + { + unsigned j; + for (j = 0; j < 8; j++) + __loop__( + invariant(i < MLKEM_N / 8 && j <= 8) + invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) + { + /* Prevent the compiler from recognizing this as a bit selection */ + uint8_t mask = value_barrier_u8(1u << j); + r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask); + } + } + debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a) +{ + unsigned i; + debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_N / 8; i++) + __loop__(invariant(i <= MLKEM_N / 8)) + { + unsigned j; + msg[i] = 0; + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8)) + { + uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]); + msg[i] |= t << j; + } + } +} + +#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ + +#define empty_cu_compress MLKEM_NAMESPACE_K(empty_cu_compress) +int empty_cu_compress; + +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/compress.h new file mode 100644 index 000000000..409dbe519 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/compress.h @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef COMPRESS_H +#define COMPRESS_H + +#include +#include +#include "cbmc.h" +#include "common.h" +#include "debug.h" +#include "poly.h" +#include "verify.h" + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1) +#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4) +#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5) +#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10) +#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11) +#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4) +#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5) +#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10) +#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11) +/* End of static namespacing */ + +/************************************************************ + * Name: scalar_compress_d1 + * + * Description: Computes round(u * 2 / q) + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 1. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. + */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d1(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 2) + ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) ) +{ + uint32_t d0 = u << 1; + d0 *= 645083; + d0 += 1u << 30; + d0 >>= 31; + return d0; +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_compress_d4 + * + * Description: Computes round(u * 16 / q) % 16 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 4. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. 
+ ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. + */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d4(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 16) + ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16)) +{ + uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */ + return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */ +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d4 + * + * Description: Computes round(u * q / 16) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 4. + * + * Arguments: - u: Unsigned canonical modulus modulo 16 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d4(uint32_t u) +__contract__( + requires(0 <= u && u < 16) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 8) / 16; } + +/************************************************************ + * Name: scalar_compress_d5 + * + * Description: Computes round(u * 32 / q) % 32 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 5. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. 
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d5(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < 32) + ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) ) +{ + uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */ + return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */ +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d5 + * + * Description: Computes round(u * q / 32) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 5. + * + * Arguments: - u: Unsigned canonical modulus modulo 32 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d5(uint32_t u) +__contract__( + requires(0 <= u && u < 32) + ensures(return_value <= MLKEM_Q - 1) +) { return ((u * MLKEM_Q) + 16) / 32; } + +/************************************************************ + * Name: scalar_compress_d10 + * + * Description: Computes round(u * 2**10 / q) % 2**10 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 10. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required. 
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d10(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < (1u << 10)) + ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10))) +{ + uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^33 / MLKEM_Q) */ + d0 = (d0 + ((uint64_t)1u << 32)) >> 33; + return (d0 & 0x3FF); +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d10 + * + * Description: Computes round(u * q / 1024) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 10. + * + * Arguments: - u: Unsigned canonical modulus modulo 1024 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d10(uint32_t u) +__contract__( + requires(0 <= u && u < 1024) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 512) / 1024; } + +/************************************************************ + * Name: scalar_compress_d11 + * + * Description: Computes round(u * 2**11 / q) % 2**11 + * + * Implements Compress_d from FIPS203, Eq (4.7), + * for d = 11. + * + * Arguments: - u: Unsigned canonical modulus modulo q + * to be compressed. + ************************************************************/ +/* + * The multiplication in this routine will exceed UINT32_MAX + * and wrap around for large values of u. This is expected and required.
+ */ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "unsigned-overflow" +#endif +static INLINE uint32_t scalar_compress_d11(uint16_t u) +__contract__( + requires(u <= MLKEM_Q - 1) + ensures(return_value < (1u << 11)) + ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11))) +{ + uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */ + d0 = (d0 + ((uint64_t)1u << 32)) >> 33; + return (d0 & 0x7FF); +} +#ifdef CBMC +#pragma CPROVER check pop +#endif + +/************************************************************ + * Name: scalar_decompress_d11 + * + * Description: Computes round(u * q / 2048) + * + * Implements Decompress_d from FIPS203, Eq (4.8), + * for d = 11. + * + * Arguments: - u: Unsigned canonical modulus modulo 2048 + * to be decompressed. + ************************************************************/ +static INLINE uint16_t scalar_decompress_d11(uint32_t u) +__contract__( + requires(0 <= u && u < 2048) + ensures(return_value <= (MLKEM_Q - 1)) +) { return ((u * MLKEM_Q) + 1024) / 2048; } + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \ + (MLKEM_K == 2 || MLKEM_K == 3) +#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4) +/************************************************* + * Name: poly_compress_d4 + * + * Description: Compression (4 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a); + +#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10) +/************************************************* + * Name: poly_compress_d10 + * + * Description: Compression (10 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a); + +#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4) +/************************************************* + * Name: poly_decompress_d4 + * + * Description: De-serialization and subsequent decompression (dv bits) of a + * polynomial; approximate inverse of poly_compress + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). 
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]); + +#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10) +/************************************************* + * Name: poly_decompress_d10 + * + * Description: De-serialization and subsequent decompression (10 bits) of a + * polynomial; approximate inverse of poly_compress_d10 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d10(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]); +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ + || MLKEM_K == 3) */ + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 +#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5) +/************************************************* + * Name: poly_compress_d5 + * + * Description: Compression (5 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a); + +#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11) +/************************************************* + * Name: poly_compress_d11 + * + * Description: Compression (11 bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a); + +#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5) +/************************************************* + * Name: poly_decompress_d5 + * + * Description: De-serialization and subsequent decompression (dv bits) of a + * polynomial; approximate inverse of poly_compress + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). 
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]); + +#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11) +/************************************************* + * Name: poly_decompress_d11 + * + * Description: De-serialization and subsequent decompression (11 bits) of a + * polynomial; approximate inverse of poly_compress_d11 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_decompress_d11(poly *r, + const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]); +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \ + */ + +#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes) +/************************************************* + * Name: poly_tobytes + * + * Description: Serialization of a polynomial. + * Signed coefficients are converted to + * unsigned form before serialization. 
+ * + * Arguments: INPUT: + * - a: const pointer to input polynomial, + * with each coefficient in the range [0,1,..,Q-1] + * OUTPUT + * - r: pointer to output byte array + * (of MLKEM_POLYBYTES bytes) + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYBYTES)) + requires(memory_no_alias(a, sizeof(poly))) + requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(r)) +); + + +#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes) +/************************************************* + * Name: poly_frombytes + * + * Description: De-serialization of a polynomial. + * + * Arguments: INPUT + * - a: pointer to input byte array + * (of MLKEM_POLYBYTES bytes) + * OUTPUT + * - r: pointer to output polynomial, with + * each coefficient unsigned and in the range + * 0 .. 4095 + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)) +); + + +#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg) +/************************************************* + * Name: poly_frommsg + * + * Description: Convert 32-byte message to polynomial + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *msg: pointer to input message + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +__contract__( + requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(object_whole(r)) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) +); + +#define poly_tomsg 
MLKEM_NAMESPACE(poly_tomsg) +/************************************************* + * Name: poly_tomsg + * + * Description: Convert polynomial to 32-byte message + * + * Arguments: - uint8_t *msg: pointer to output message + * - const poly *r: pointer to input polynomial + * Coefficients must be unsigned canonical + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r) +__contract__( + requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) + requires(memory_no_alias(r, sizeof(poly))) + requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(msg)) +); + +#endif /* COMPRESS_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/config.h index fa89370ce..e975ede95 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/config.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/config.h @@ -122,46 +122,87 @@ /* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ /****************************************************************************** - * Name: MLKEM_USE_NATIVE + * Name: MLKEM_USE_NATIVE_BACKEND_ARITH * - * Description: Determines whether a native backend should - * be used, if available. + * Description: Determines whether a native arithmetic backend should be used. + * + * The arithmetic backend covers performance critical functions + * such as the number-theoretic transform (NTT). + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the arithmetic backend to be used is + * determined by MLKEM_NATIVE_ARITH_BACKEND_FILE: If the latter is + * unset, the default backend for the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. * * This can also be set using CFLAGS.
* *****************************************************************************/ -#if !defined(MLKEM_USE_NATIVE) -/* #define MLKEM_USE_NATIVE */ +#if !defined(MLKEM_USE_NATIVE_BACKEND_ARITH) +/* #define MLKEM_USE_NATIVE_BACKEND_ARITH */ #endif /****************************************************************************** - * Name: MLKEM_NATIVE_ARITH_BACKEND + * Name: MLKEM_NATIVE_ARITH_BACKEND_FILE * * Description: The arithmetic backend to use. * - * This must be the filename of an arithmetic backend. - * See the existing backends for examples. + * If MLKEM_USE_NATIVE_BACKEND_ARITH is unset, this option + * is ignored. + * + * If MLKEM_USE_NATIVE_BACKEND_ARITH is set, this option must + * either be undefined or the filename of an arithmetic backend. + * If unset, the default backend will be used. * * This can be set using CFLAGS. * *****************************************************************************/ -#if defined(MLKEM_USE_NATIVE) && !defined(MLKEM_NATIVE_ARITH_BACKEND) -#define MLKEM_NATIVE_ARITH_BACKEND "default.h" -#endif /* MLKEM_NATIVE_ARITH_BACKEND */ +#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) && \ + !defined(MLKEM_NATIVE_ARITH_BACKEND_FILE) +#define MLKEM_NATIVE_ARITH_BACKEND_FILE "native/default.h" +#endif /****************************************************************************** - * Name: MLKEM_NATIVE_FIPS202_BACKEND + * Name: MLKEM_USE_NATIVE_BACKEND_FIPS202 + * + * Description: Determines whether a native FIPS202 backend should be used. + * + * The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is + * the performance bottleneck of SHA3 and SHAKE. + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the FIPS202 backend to be used is + * determined by MLKEM_NATIVE_FIPS202_BACKEND_FILE: If the latter is + * unset, the default backend for the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. + * + * This can also be set using CFLAGS. 
+ * + *****************************************************************************/ +#if !defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) +/* #define MLKEM_USE_NATIVE_BACKEND_FIPS202 */ +#endif + +/****************************************************************************** + * Name: MLKEM_NATIVE_FIPS202_BACKEND_FILE * * Description: The FIPS-202 backend to use. * - * This must be the filename of an FIPS-202 backend. + * If MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, this option must + * either be undefined or the filename of a FIPS202 backend. + * If unset, the default backend will be used. * * This can be set using CFLAGS. * *****************************************************************************/ -#if defined(MLKEM_USE_NATIVE_FIPS202) && !defined(MLKEM_NATIVE_FIPS202_BACKEND) -#define MLKEM_NATIVE_FIPS202_BACKEND "native/default.h" -#endif /* MLKEM_NATIVE_FIPS202_BACKEND */ +#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) && \ + !defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE) +#define MLKEM_NATIVE_FIPS202_BACKEND_FILE "fips202/native/default.h" +#endif /************************* Config internals ********************************/ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/default.h deleted file mode 100644 index d1e41c52e..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/default.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H -#define MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H - -/* - * Default arithmetic backend - */ -#include "sys.h" - -#ifdef SYS_AARCH64 -/* - * For AArch64, we currently we have one clean and one opt profile. - * We default to the opt profile. - * - * In the future, this may branch further depending on the microarchitecture. 
- */ -#include "aarch64/opt.h" -#endif /* SYS_AARCH64 */ - -#ifdef SYS_X86_64_AVX2 -/* - * For now, there's only one x86_64 profile, based on - * the AVX2 code from the Kyber repository. - * https://github.com/pq-crystals/kyber - */ -#include "x86_64/default.h" -#endif /* SYS_X86_64 */ - -#endif /* MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.c index 0cfcc3e9e..318d0fc77 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.c @@ -9,11 +9,10 @@ #include "fips202.h" #include "fips202x4.h" #include "indcpa.h" -#include "ntt.h" #include "poly.h" -#include "polyvec.h" +#include "poly_k.h" #include "randombytes.h" -#include "rej_uniform.h" +#include "sampling.h" #include "symmetric.h" #include "arith_backend.h" @@ -149,14 +148,14 @@ static void unpack_ciphertext(polyvec *b, poly *v, #define poly_permute_bitrev_to_custom \ MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom) -static INLINE void poly_permute_bitrev_to_custom(poly *data) +static INLINE void poly_permute_bitrev_to_custom(int16_t data[MLKEM_N]) __contract__( /* We don't specify that this should be a permutation, but only * that it does not change the bound established at the end of gen_matrix. 
*/ - requires(memory_no_alias(data, sizeof(poly))) - requires(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N)) + requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q)) assigns(memory_slice(data, sizeof(poly))) - ensures(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); } + ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); } #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ /* Not static for benchmarking */ @@ -245,7 +244,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed) { for (j = 0; j < MLKEM_K; j++) { - poly_permute_bitrev_to_custom(&a[i].vec[j]); + poly_permute_bitrev_to_custom(a[i].vec[j].coeffs); } } } diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.h index 2c4fda3c4..b4d5985bf 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.h @@ -8,7 +8,7 @@ #include #include "cbmc.h" #include "common.h" -#include "polyvec.h" +#include "poly_k.h" #define gen_matrix MLKEM_NAMESPACE_K(gen_matrix) /************************************************* diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/api.h new file mode 100644 index 000000000..0704f9dcd --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/api.h @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * Native arithmetic interface + * + * This header is primarily for documentation purposes. + * It should not be included by backend implementations. + * + * To ensure consistency with backends, the header will be + * included automatically after inclusion of the active + * backend, to ensure consistency of function signatures, + * and run sanity checks. 
+ */ +#ifdef MLKEM_NATIVE_ARITH_NATIVE_API_H +#error \ + "The arithmetic backend API `mlkem/native/api.h` " \ + "should not be directly included. Please include the relevant " \ + "structure headers directly." +#else /* MLKEM_NATIVE_ARITH_NATIVE_API_H */ +#define MLKEM_NATIVE_ARITH_NATIVE_API_H + +#include <stdint.h> +#include "../common.h" + +/* + * This is the C<->native interface allowing for the drop-in of + * native code for performance critical arithmetic components of ML-KEM. + * + * A _backend_ is a specific implementation of (part of) this interface. + * + * To add a function to a backend, define MLKEM_USE_NATIVE_XXX and + * implement `static inline xxx(...)` in the profile header. + * + * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can + * be set if there are native implementations for all of NTT, invNTT, and + * base multiplication, and allows the native implementation to use a + * custom order of polynomial coefficients in NTT domain -- the use of such + * custom order is not an implementation-detail since the public matrix + * is generated in NTT domain. In this case, a permutation function + * poly_permute_bitrev_to_custom() needs to be provided that permutes + * polynomials in NTT domain from bitreversed to the custom order. + */ + +/* + * Those functions are meant to be trivial wrappers around the chosen native + * implementation. They are static inline to avoid unnecessary calls. + * The macro before each declaration controls whether a native + * implementation is present. + */ + +#if defined(MLKEM_USE_NATIVE_NTT) +/************************************************* + * Name: ntt_native + * + * Description: Computes negacyclic number-theoretic transform (NTT) of + * a polynomial in place. + * + * The input polynomial is assumed to be in normal order. + * The output polynomial is in bitreversed order, or of a + * custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. 
+ * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER + * for more information. + * + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial + **************************************************/ +static INLINE void ntt_native(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_NTT */ + +#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER) +/* + * This must only be set if NTT, invNTT, basemul, mulcache, and + * to/from byte stream conversions all have native implementations + * that are adapted to the custom order. + */ +#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) || \ + !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) || \ + !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) +#error \ + "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \ +set if there are native implementations for NTT, invNTT, mulcache, basemul, \ +and to/from bytes conversions." +#endif + +/************************************************* + * Name: poly_permute_bitrev_to_custom + * + * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined, + * convert a polynomial in NTT domain from bitreversed + * order to the custom order output by the native NTT. + * + * This must only be defined if there is native code for + * all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache. + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial + * + **************************************************/ +static INLINE void poly_permute_bitrev_to_custom(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */ + +#if defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: intt_native + * + * Description: Computes inverse of negacyclic number-theoretic transform (NTT) + * of a polynomial in place. 
+ * + * The input polynomial is in bitreversed order, or of a + * custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. + * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER + * for more information. + * The output polynomial is assumed to be in normal order. + * + * Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial + **************************************************/ +static INLINE void intt_native(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_INTT */ + +#if defined(MLKEM_USE_NATIVE_POLY_REDUCE) +/************************************************* + * Name: poly_reduce_native + * + * Description: Applies modular reduction to all coefficients of a polynomial. + * + * Arguments: - int16_t p[MLKEM_N]: pointer to input/output polynomial + **************************************************/ +static INLINE void poly_reduce_native(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ + +#if defined(MLKEM_USE_NATIVE_POLY_TOMONT) +/************************************************* + * Name: poly_tomont_native + * + * Description: Inplace conversion of all coefficients of a polynomial + * from normal domain to Montgomery domain + * + * Arguments: - int16_t p[MLKEM_N]: pointer to input/output polynomial + **************************************************/ +static INLINE void poly_tomont_native(int16_t p[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ + +#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) +/************************************************* + * Name: poly_mulcache_compute_native + * + * Description: Compute multiplication cache for a polynomial + * in NTT domain. + * + * The purpose of the multiplication cache is to + * cache repeated computations required during a + * base multiplication of polynomials in NTT domain. + * The structure of the multiplication-cache is + * implementation defined. + * + * Arguments: INPUT: + * - poly: const pointer to input polynomial. 
+ * This must be in NTT domain and in bitreversed order, or of + * a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. + * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER + * for more information. + * OUTPUT + * - cache: pointer to multiplication cache + **************************************************/ +static INLINE void poly_mulcache_compute_native(int16_t cache[MLKEM_N / 2], + const int16_t poly[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ + +#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) +/************************************************* + * Name: polyvec_basemul_acc_montgomery_cached_native + * + * Description: Compute multiplication of polynomials in NTT domain. + * + * Arguments: INPUT: + * - a: First polynomial operand. + * This must be in NTT domain and in bitreversed order, or of + * a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set. + * See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER + * for more information. + * - b: Second polynomial operand. + * As for a. + * - b_cache: Multiplication-cache for b. + * OUTPUT + * - r: Result of the base multiplication. This is again + * in NTT domain, and of the same order as a and b. + **************************************************/ +static INLINE void polyvec_basemul_acc_montgomery_cached_native( + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]); +#endif + +#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES) +/************************************************* + * Name: poly_tobytes_native + * + * Description: Serialization of a polynomial. + * Signed coefficients are converted to + * unsigned form before serialization. + * + * Arguments: INPUT: + * - a: const pointer to input polynomial, + * with each coefficient in the range -Q+1 .. 
Q-1 + * OUTPUT + * - r: pointer to output byte array + * (of MLKEM_POLYBYTES bytes) + **************************************************/ +static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], + const int16_t a[MLKEM_N]); +#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + +#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) +/************************************************* + * Name: poly_frombytes_native + * + * Description: De-serialization of a polynomial. + * Converts a byte array into the + * coefficients of a polynomial. + * + * Arguments: INPUT: + * - r: const pointer to input byte array + * (of MLKEM_POLYBYTES bytes) + * OUTPUT + * - a: pointer to output polynomial in NTT domain + **************************************************/ +static INLINE void poly_frombytes_native(int16_t a[MLKEM_N], + const uint8_t r[MLKEM_POLYBYTES]); +#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ + +#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM) +/************************************************* + * Name: rej_uniform_native + * + * Description: Run rejection sampling on uniform random bytes to generate + * uniform random integers mod q + * + * Arguments: - int16_t *r: pointer to output buffer + * - unsigned int len: requested number of 16-bit integers + * (uniform mod q). + * - const uint8_t *buf: pointer to input buffer + * (assumed to be uniform random bytes) + * - unsigned int buflen: length of input buffer in bytes. + * + * Return -1 if the native implementation does not support the input lengths. + * Otherwise, returns non-negative number of sampled 16-bit integers (at most + * len). 
+ **************************************************/ +static INLINE int rej_uniform_native(int16_t *r, unsigned int len, + const uint8_t *buf, unsigned int buflen); +#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */ + +#endif /* MLKEM_NATIVE_ARITH_NATIVE_API_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/default.h new file mode 100644 index 000000000..f9fe4310a --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/default.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H +#define MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H + +/* + * Default arithmetic backend + */ +#include "../sys.h" + +#ifdef SYS_AARCH64 +/* + * For AArch64, we currently have one clean and one opt profile. + * We default to the opt profile. + * + * In the future, this may branch further depending on the microarchitecture. + */ +#include "aarch64/opt.h" +#endif /* SYS_AARCH64 */ + +#ifdef SYS_X86_64_AVX2 +/* + * For now, there's only one x86_64 profile, based on + * the AVX2 code from the Kyber repository. 
+ * https://github.com/pq-crystals/kyber + */ +#include "x86_64/default.h" +#endif /* SYS_X86_64 */ + +#endif /* MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/README.md b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/README.md similarity index 100% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/README.md rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/README.md diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/default.h similarity index 90% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/default.h rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/default.h index 592e8996d..73f53dc13 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/default.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/default.h @@ -19,6 +19,6 @@ /* Filename of the C backend implementation. * This is not inlined here because this header is included in assembly * files as well. 
*/ -#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "x86_64/src/default_impl.h" +#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/x86_64/src/default_impl.h" #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/align.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/align.h similarity index 100% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/align.h rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/align.h diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/arith_native_x86_64.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/arith_native_x86_64.h similarity index 91% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/arith_native_x86_64.h rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/arith_native_x86_64.h index 25e00a930..acde977ad 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/arith_native_x86_64.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/arith_native_x86_64.h @@ -5,11 +5,10 @@ #ifndef MLKEM_X86_64_NATIVE_H #define MLKEM_X86_64_NATIVE_H -#include "common.h" +#include "../../../common.h" #include #include -#include "polyvec.h" #include "consts.h" #define REJ_UNIFORM_AVX_NBLOCKS 3 /* See MLKEM_GEN_MATRIX_NBLOCKS */ @@ -44,8 +43,9 @@ void basemul_avx2(__m256i *r, const __m256i *a, const __m256i *b, #define polyvec_basemul_acc_montgomery_cached_avx2 \ MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_avx2) void polyvec_basemul_acc_montgomery_cached_avx2( - poly *r, const polyvec *a, const polyvec *b, - const polyvec_mulcache *b_cache); + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]); #define ntttobytes_avx2 MLKEM_NAMESPACE(ntttobytes_avx2) void ntttobytes_avx2(uint8_t *r, const __m256i *a, const __m256i *qdata); diff 
--git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.S rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.S index b97840e70..5fdc3d0a0 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.S @@ -6,7 +6,7 @@ // Implementation from Kyber reference repository // https://github.com/pq-crystals/kyber/blob/main/avx2 -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) #include "consts.h" diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.c similarity index 51% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.c rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.c index 5f9ae99c8..970938306 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.c @@ -3,46 +3,46 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) -#include "poly.h" -#include "polyvec.h" - #include "arith_native_x86_64.h" #include "consts.h" -static void poly_basemul_montgomery_avx2(poly *r, const poly *a, const poly *b) +static void poly_basemul_montgomery_avx2(int16_t r[MLKEM_N], + const int16_t a[MLKEM_N], + const int16_t b[MLKEM_N]) { - basemul_avx2((__m256i *)r->coeffs, (const __m256i *)a->coeffs, - (const __m256i *)b->coeffs, qdata.vec); + basemul_avx2((__m256i *)r, (const __m256i *)a, (const __m256i *)b, qdata.vec); } /* * Implementation from Kyber reference 
repository * https://github.com/pq-crystals/kyber/blob/main/avx2 */ -static void poly_add_avx2(poly *r, const poly *a, const poly *b) +static void poly_add_avx2(int16_t r[MLKEM_N], const int16_t a[MLKEM_N], + const int16_t b[MLKEM_N]) { unsigned i; __m256i f0, f1; for (i = 0; i < MLKEM_N; i += 16) { - f0 = _mm256_load_si256((const __m256i *)&a->coeffs[i]); - f1 = _mm256_load_si256((const __m256i *)&b->coeffs[i]); + f0 = _mm256_load_si256((const __m256i *)&a[i]); + f1 = _mm256_load_si256((const __m256i *)&b[i]); f0 = _mm256_add_epi16(f0, f1); - _mm256_store_si256((__m256i *)&r->coeffs[i], f0); + _mm256_store_si256((__m256i *)&r[i], f0); } } -void polyvec_basemul_acc_montgomery_cached_avx2(poly *r, const polyvec *a, - const polyvec *b, - const polyvec_mulcache *b_cache) +void polyvec_basemul_acc_montgomery_cached_avx2( + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]) { unsigned i; - poly t; + int16_t t[MLKEM_N] ALIGN; /* TODO: Use mulcache for AVX2. So far, it is unused. */ ((void)b_cache); @@ -50,11 +50,11 @@ void polyvec_basemul_acc_montgomery_cached_avx2(poly *r, const polyvec *a, /* Coefficient-wise bound of each basemul is 2q. * Since we are accumulating at most 4 times, the * overall bound is 8q < INT16_MAX. 
*/ - poly_basemul_montgomery_avx2(r, &a->vec[0], &b->vec[0]); + poly_basemul_montgomery_avx2(r, &a[0], &b[0]); for (i = 1; i < MLKEM_K; i++) { - poly_basemul_montgomery_avx2(&t, &a->vec[i], &b->vec[i]); - poly_add_avx2(r, r, &t); + poly_basemul_montgomery_avx2(t, &a[i * MLKEM_N], &b[i * MLKEM_N]); + poly_add_avx2(r, r, t); } } diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/consts.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/consts.c similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/consts.c rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/consts.c index 86a0835ef..568752ae8 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/consts.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/consts.c @@ -8,7 +8,7 @@ * https://github.com/pq-crystals/kyber/blob/main/avx2/consts.c */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/consts.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/consts.h similarity index 97% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/consts.h rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/consts.h index 00c415952..e2846b609 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/consts.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/consts.h @@ -11,7 +11,7 @@ #ifndef CONSTS_H #define CONSTS_H -#include "common.h" +#include "../../../common.h" #define AVX2_BACKEND_DATA_OFFSET_16XQ 0 #define AVX2_BACKEND_DATA_OFFSET_16XQINV 16 diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/default_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/default_impl.h similarity index 62% rename from 
src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/default_impl.h rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/default_impl.h index 029111c17..3683361e2 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/default_impl.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/default_impl.h @@ -12,8 +12,7 @@ #include -#include "poly.h" -#include "polyvec.h" +#include "../../../params.h" #include "arith_native_x86_64.h" #define MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER @@ -28,9 +27,9 @@ #define MLKEM_USE_NATIVE_POLY_TOBYTES #define MLKEM_USE_NATIVE_POLY_FROMBYTES -static INLINE void poly_permute_bitrev_to_custom(poly *data) +static INLINE void poly_permute_bitrev_to_custom(int16_t data[MLKEM_N]) { - nttunpack_avx2((__m256i *)(data->coeffs), qdata.vec); + nttunpack_avx2((__m256i *)(data), qdata.vec); } static INLINE int rej_uniform_native(int16_t *r, unsigned int len, @@ -45,27 +44,28 @@ static INLINE int rej_uniform_native(int16_t *r, unsigned int len, return (int)rej_uniform_avx2(r, buf); } -static INLINE void ntt_native(poly *data) +static INLINE void ntt_native(int16_t data[MLKEM_N]) { ntt_avx2((__m256i *)data, qdata.vec); } -static INLINE void intt_native(poly *data) +static INLINE void intt_native(int16_t data[MLKEM_N]) { invntt_avx2((__m256i *)data, qdata.vec); } -static INLINE void poly_reduce_native(poly *data) +static INLINE void poly_reduce_native(int16_t data[MLKEM_N]) { - reduce_avx2((__m256i *)data->coeffs, qdata.vec); + reduce_avx2((__m256i *)data, qdata.vec); } -static INLINE void poly_tomont_native(poly *data) +static INLINE void poly_tomont_native(int16_t data[MLKEM_N]) { - tomont_avx2((__m256i *)data->coeffs, qdata.vec); + tomont_avx2((__m256i *)data, qdata.vec); } -static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const poly *y) +static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2], + const int16_t y[MLKEM_N]) { /* AVX2 backend does not use mulcache */ 
((void)y); @@ -73,22 +73,23 @@ static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const poly *y) } static INLINE void polyvec_basemul_acc_montgomery_cached_native( - poly *r, const polyvec *a, const polyvec *b, - const polyvec_mulcache *b_cache) + int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], + const int16_t b[MLKEM_K * MLKEM_N], + const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]) { polyvec_basemul_acc_montgomery_cached_avx2(r, a, b, b_cache); } static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], - const poly *a) + const int16_t a[MLKEM_N]) { - ntttobytes_avx2(r, (const __m256i *)a->coeffs, qdata.vec); + ntttobytes_avx2(r, (const __m256i *)a, qdata.vec); } -static INLINE void poly_frombytes_native(poly *r, +static INLINE void poly_frombytes_native(int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYBYTES]) { - nttfrombytes_avx2((__m256i *)r->coeffs, a, qdata.vec); + nttfrombytes_avx2((__m256i *)r, a, qdata.vec); } #endif /* MLKEM_NATIVE_ARITH_PROFILE_IMPL_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/fq.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/fq.S similarity index 98% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/fq.S rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/fq.S index 134bd4f71..3f013a5fa 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/fq.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/fq.S @@ -11,7 +11,7 @@ // in [0,1,...,q-1] rather than [0,1,...,q], matching the // semantics of poly_reduce(). 
-#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) #include "consts.h" diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/fq.inc b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/fq.inc similarity index 100% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/fq.inc rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/fq.inc diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/intt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/intt.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/intt.S rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/intt.S index 6b1d78ef2..7b1f22624 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/intt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/intt.S @@ -9,7 +9,7 @@ * Changes to placement of modular reductions have * been made to simplify reasoning of non-overflow */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/ntt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntt.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/ntt.S rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntt.S index e8bf7894b..5d928b4cc 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/ntt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntt.S @@ -6,7 +6,7 @@ // Implementation from Kyber reference repository // https://github.com/pq-crystals/kyber/blob/main/avx2 -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) #include "consts.h" diff --git 
a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/rej_uniform_avx2.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_avx2.c similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/rej_uniform_avx2.c rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_avx2.c index 54037a0df..adf2d338b 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/rej_uniform_avx2.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_avx2.c @@ -8,7 +8,7 @@ * https://github.com/pq-crystals/kyber/blob/main/avx2 */ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/rej_uniform_table.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_table.c similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/rej_uniform_table.c rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_table.c index 9bbc47146..e95fd9e79 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/rej_uniform_table.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_table.c @@ -8,7 +8,7 @@ * Do not modify it directly. 
*/ -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/shuffle.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/shuffle.S similarity index 99% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/shuffle.S rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/shuffle.S index 5e708748a..9bcd04896 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/shuffle.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/shuffle.S @@ -6,7 +6,7 @@ // Implementation from Kyber reference repository // https://github.com/pq-crystals/kyber/blob/main/avx2 -#include "common.h" +#include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/shuffle.inc b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/shuffle.inc similarity index 100% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/shuffle.inc rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/shuffle.inc diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/x86_64_zetas.i b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/x86_64_zetas.i similarity index 100% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/x86_64_zetas.i rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/x86_64_zetas.i diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/ntt.c deleted file mode 100644 index 3651c8da9..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/ntt.c +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#if 
!defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) - -#include -#include "arith_backend.h" -#include "debug.h" -#include "ntt.h" -#include "reduce.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block) -#define ntt_layer MLKEM_NAMESPACE(ntt_layer) -#define invntt_layer MLKEM_NAMESPACE(invntt_layer) -/* End of static namespacing */ - -#if !defined(MLKEM_USE_NATIVE_NTT) -/* - * Computes a block CT butterflies with a fixed twiddle factor, - * using Montgomery multiplication. - * Parameters: - * - r: Pointer to base of polynomial (_not_ the base of butterfly block) - * - root: Twiddle factor to use for the butterfly. This must be in - * Montgomery form and signed canonical. - * - start: Offset to the beginning of the butterfly block - * - len: Index difference between coefficients subject to a butterfly - * - bound: Ghost variable describing coefficient bound: Prior to `start`, - * coefficients must be bound by `bound + MLKEM_Q`. Post `start`, - * they must be bound by `bound`. - * When this function returns, output coefficients in the index range - * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`. 
- * Example: - * - start=8, len=4 - * This would compute the following four butterflies - * 8 -- 12 - * 9 -- 13 - * 10 -- 14 - * 11 -- 15 - * - start=4, len=2 - * This would compute the following two butterflies - * 4 -- 6 - * 5 -- 7 - */ -static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, - unsigned start, unsigned len, int bound) -__contract__( - requires(start < MLKEM_N) - requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N) - requires(0 <= bound && bound < INT16_MAX - MLKEM_Q) - requires(-HALF_Q < zeta && zeta < HALF_Q) - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(array_abs_bound(r, 0, start, bound + MLKEM_Q)) - requires(array_abs_bound(r, start, MLKEM_N, bound)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q)) - ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound))) -{ - /* `bound` is a ghost variable only needed in the CBMC specification */ - unsigned j; - ((void)bound); - for (j = start; j < start + len; j++) - __loop__( - invariant(start <= j && j <= start + len) - /* - * Coefficients are updated in strided pairs, so the bounds for the - * intermediate states alternate twice between the old and new bound - */ - invariant(array_abs_bound(r, 0, j, bound + MLKEM_Q)) - invariant(array_abs_bound(r, j, start + len, bound)) - invariant(array_abs_bound(r, start + len, j + len, bound + MLKEM_Q)) - invariant(array_abs_bound(r, j + len, MLKEM_N, bound))) - { - int16_t t; - t = fqmul(r[j + len], zeta); - r[j + len] = r[j] - t; - r[j] = r[j] + t; - } -} - -/* - *Compute one layer of forward NTT - * Parameters: - * - r: Pointer to base of polynomial - * - len: Stride of butterflies in this layer. - * - layer: Ghost variable indicating which layer is being applied. - * Must match `len` via `len == MLKEM_N >> layer`. 
- * Note: `len` could be dropped and computed in the function, but - * we are following the structure of the reference NTT from the - * official Kyber implementation here, merely adding `layer` as - * a ghost variable for the specifications. - */ -static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer) -__contract__( - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer)) - requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q))) -{ - unsigned start, k; - /* `layer` is a ghost variable only needed in the CBMC specification */ - ((void)layer); - /* Twiddle factors for layer n start at index 2^(layer-1) */ - k = MLKEM_N / (2 * len); - for (start = 0; start < MLKEM_N; start += 2 * len) - __loop__( - invariant(start < MLKEM_N + 2 * len) - invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N) - invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q)) - invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q))) - { - int16_t zeta = zetas[k++]; - ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q); - } -} - -/* - * Compute full forward NTT - * NOTE: This particular implementation satisfies a much tighter - * bound on the output coefficients (5*q) than the contractual one (8*q), - * but this is not needed in the calling code. Should we change the - * base multiplication strategy to require smaller NTT output bounds, - * the proof may need strengthening. 
- */ - -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *p) -{ - unsigned len, layer; - int16_t *r; - debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); - r = p->coeffs; - - for (len = 128, layer = 1; len >= 2; len >>= 1, layer++) - __loop__( - invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer)) - invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))) - { - ntt_layer(r, len, layer); - } - - /* Check the stronger bound */ - debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); -} -#else /* MLKEM_USE_NATIVE_NTT */ - -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *p) -{ - debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); - ntt_native(p); - debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); -} -#endif /* MLKEM_USE_NATIVE_NTT */ - -#if !defined(MLKEM_USE_NATIVE_INTT) - -/* Compute one layer of inverse NTT */ -static void invntt_layer(int16_t *r, unsigned len, unsigned layer) -__contract__( - requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) - requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7) - requires(len == (1 << (8 - layer))) - requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) - ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) -{ - unsigned start, k; - /* `layer` is a ghost variable used only in the specification */ - ((void)layer); - k = MLKEM_N / len - 1; - for (start = 0; start < MLKEM_N; start += 2 * len) - __loop__( - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) - invariant(start <= MLKEM_N && k <= 127) - /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */ - invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len)) - { - unsigned j; - int16_t zeta = zetas[k--]; - for (j = start; j < start + len; j++) - __loop__( - invariant(start <= j && j <= start + len) - invariant(start <= MLKEM_N && k <= 127) - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) - { - int16_t t = r[j]; - r[j] = barrett_reduce(t + r[j + len]); - r[j + len] = r[j + len] - t; - r[j + len] = fqmul(r[j + len], 
zeta); - } - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *p) -{ - /* - * Scale input polynomial to account for Montgomery factor - * and NTT twist. This also brings coefficients down to - * absolute value < MLKEM_Q. - */ - unsigned j, len, layer; - const int16_t f = 1441; - int16_t *r = p->coeffs; - - for (j = 0; j < MLKEM_N; j++) - __loop__( - invariant(j <= MLKEM_N) - invariant(array_abs_bound(r, 0, j, MLKEM_Q))) - { - r[j] = fqmul(r[j], f); - } - - /* Run the invNTT layers */ - for (len = 2, layer = 7; len <= 128; len <<= 1, layer--) - __loop__( - invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer))) - invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) - { - invntt_layer(p->coeffs, len, layer); - } - - debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); -} -#else /* MLKEM_USE_NATIVE_INTT */ - -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *p) -{ - intt_native(p); - debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); -} -#endif /* MLKEM_USE_NATIVE_INTT */ - -MLKEM_NATIVE_INTERNAL_API -void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], - int16_t b_cached) -{ - int32_t t0, t1; - debug_assert_bound(a, 2, 0, UINT12_LIMIT); - - t0 = (int32_t)a[1] * b_cached; - t0 += (int32_t)a[0] * b[0]; - t1 = (int32_t)a[0] * b[1]; - t1 += (int32_t)a[1] * b[0]; - - /* |ti| < 2 * q * 2^15 */ - r[0] = montgomery_reduce(t0); - r[1] = montgomery_reduce(t1); - - debug_assert_abs_bound(r, 2, 2 * MLKEM_Q); -} - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt) -int empty_cu_ntt; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/ntt.h deleted file mode 100644 index 4e80d3ab3..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/ntt.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * 
SPDX-License-Identifier: Apache-2.0 - */ -#ifndef NTT_H -#define NTT_H -#include "common.h" - -#include -#include "cbmc.h" -#include "poly.h" -#include "reduce.h" - -#define zetas MLKEM_NAMESPACE(zetas) -extern const int16_t zetas[128]; - -#define poly_ntt MLKEM_NAMESPACE(poly_ntt) -/************************************************* - * Name: poly_ntt - * - * Description: Computes negacyclic number-theoretic transform (NTT) of - * a polynomial in place. - * - * The input is assumed to be in normal order and - * coefficient-wise bound by MLKEM_Q in absolute value. - * - * The output polynomial is in bitreversed order, and - * coefficient-wise bound by NTT_BOUND in absolute value. - * - * (NOTE: Sometimes the input to the NTT is actually smaller, - * which gives better bounds.) - * - * Arguments: - poly *p: pointer to in/output polynomial - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_ntt(poly *r) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q)) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND)) -); - -#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont) -/************************************************* - * Name: poly_invntt_tomont - * - * Description: Computes inverse of negacyclic number-theoretic transform (NTT) - * of a polynomial in place; - * inputs assumed to be in bitreversed order, output in normal - * order - * - * The input is assumed to be in bitreversed order, and can - * have arbitrary coefficients in int16_t. - * - * The output polynomial is in normal order, and - * coefficient-wise bound by INVNTT_BOUND in absolute value. 
- * - * Arguments: - uint16_t *a: pointer to in/output polynomial - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_invntt_tomont(poly *r) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND)) -); - -#define basemul_cached MLKEM_NAMESPACE(basemul_cached) -/************************************************************ - * Name: basemul_cached - * - * Description: Computes a representative modulo q of - * (a0*b0 + a1*b_cached, a0*b1 + a1*b0)/65536 - * - * If b_cached is b1*zeta, this represents the - * product of (a0 + a1*X) and (b0 + b1*X) in - * Fq[X]/(X^2 - zeta). - * - * Arguments: - r: Pointer to output polynomial - * Upon return, coefficients are bound by - * 2*MLKEM_Q in absolute value. - * - a: Pointer to first input polynomial - * Every coefficient must be in [0..4095] - * - b: Pointer to second input polynomial - * Can have arbitrary int16_t coefficients - * - b_cached: Some precomputed value, typically derived from - * b1 and a twiddle factor. Can be an arbitary int16_t. 
- ************************************************************/ -MLKEM_NATIVE_INTERNAL_API -void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], - int16_t b_cached) -__contract__( - requires(memory_no_alias(r, 2 * sizeof(int16_t))) - requires(memory_no_alias(a, 2 * sizeof(int16_t))) - requires(memory_no_alias(b, 2 * sizeof(int16_t))) - requires(array_bound(a, 0, 2, 0, UINT12_LIMIT)) - assigns(memory_slice(r, 2 * sizeof(int16_t))) - ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q)) -); - -#endif /* NTT_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/params.h index 57ea4c8ba..7f6c12625 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/params.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/params.h @@ -18,6 +18,7 @@ #define MLKEM_N 256 #define MLKEM_Q 3329 #define UINT12_LIMIT 4096 +#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */ #define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define MLKEM_SSBYTES 32 /* size in bytes of shared key */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.c index 7483ebf6d..e8a2e2c6e 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.c @@ -8,388 +8,246 @@ #include #include #include "arith_backend.h" -#include "cbd.h" #include "cbmc.h" #include "debug.h" #include "fips202x4.h" -#include "ntt.h" #include "poly.h" -#include "reduce.h" +#include "sampling.h" #include "symmetric.h" #include "verify.h" -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3) -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a) -{ - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - uint8_t t[8] = {0}; - for (j = 0; 
j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_bound(t, 0, j, 0, 16))) - { - t[j] = scalar_compress_d4(a->coeffs[8 * i + j]); - } - - r[i * 4] = t[0] | (t[1] << 4); - r[i * 4 + 1] = t[2] | (t[3] << 4); - r[i * 4 + 2] = t[4] | (t[5] << 4); - r[i * 4 + 3] = t[6] | (t[7] << 4); - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a) -{ - unsigned j; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - for (j = 0; j < MLKEM_N / 4; j++) - __loop__(invariant(j <= MLKEM_N / 4)) - { - unsigned k; - uint16_t t[4]; - for (k = 0; k < 4; k++) - __loop__( - invariant(k <= 4) - invariant(forall(r, 0, k, t[r] < (1u << 10)))) - { - t[k] = scalar_compress_d10(a->coeffs[4 * j + k]); - } - - /* - * Make all implicit truncation explicit. No data is being - * truncated for the LHS's since each t[i] is 10-bit in size. - */ - r[5 * j + 0] = (t[0] >> 0) & 0xFF; - r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF); - r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF); - r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF); - r[5 * j + 4] = (t[3] >> 2); - } -} - -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 2; i++) - __loop__( - invariant(i <= MLKEM_N / 2) - invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q))) - { - r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF); - r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF); - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); -} - -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d10(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]) +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16) +#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic) +#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce) +#define fqmul MLKEM_NAMESPACE(fqmul) +#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce) +#define basemul_cached MLKEM_NAMESPACE(basemul_cached) +#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q) +#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block) +#define ntt_layer MLKEM_NAMESPACE(ntt_layer) +#define invntt_layer MLKEM_NAMESPACE(invntt_layer) +/* End of static namespacing */ + +/************************************************* + * Name: cast_uint16_to_int16 + * + * Description: Cast uint16 value to int16 + * + * Returns: + * input x in 0 .. 32767: returns value unchanged + * input x in 32768 .. 65535: returns (x - 65536) + **************************************************/ +#ifdef CBMC +#pragma CPROVER check push +#pragma CPROVER check disable "conversion" +#endif +ALWAYS_INLINE +static INLINE int16_t cast_uint16_to_int16(uint16_t x) { - unsigned j; - for (j = 0; j < MLKEM_N / 4; j++) - __loop__( - invariant(j <= MLKEM_N / 4) - invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q))) - { - unsigned k; - uint16_t t[4]; - uint8_t const *base = &a[5 * j]; - - t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); - t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6)); - t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4)); - t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2)); - - for (k = 0; k < 4; k++) - __loop__( - invariant(k <= 4) - invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q))) - { - r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]); - } - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + /* + * PORTABILITY: This relies on uint16_t -> int16_t + * being implemented as the inverse of int16_t -> uint16_t, + * which is implementation-defined (C99 6.3.1.3 (3)) + * 
CBMC (correctly) fails to prove this conversion is OK, + * so we have to suppress that check here + */ + return (int16_t)x; } -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ - || MLKEM_K == 3) */ +#ifdef CBMC +#pragma CPROVER check pop +#endif -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a) +/************************************************* + * Name: montgomery_reduce_generic + * + * Description: Generic Montgomery reduction; given a 32-bit integer a, computes + * 16-bit integer congruent to a * R^-1 mod q, where R=2^16 + * + * Arguments: - int32_t a: input integer to be reduced + * + * Returns: integer congruent to a * R^-1 modulo q, with absolute value + * <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2 + * + **************************************************/ +ALWAYS_INLINE +static INLINE int16_t montgomery_reduce_generic(int32_t a) { - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */ + const uint32_t QINV = 62209; /* q^-1 mod 2^16 */ - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - uint8_t t[8] = {0}; - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8) - invariant(array_bound(t, 0, j, 0, 32))) - { - t[j] = scalar_compress_d5(a->coeffs[8 * i + j]); - } + /* Compute a*q^{-1} mod 2^16 in unsigned representatives */ + const uint16_t a_reduced = a & UINT16_MAX; + const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX; - /* - * Explicitly truncate to avoid warning about - * implicit truncation in CBMC, and use array indexing into - * r rather than pointer-arithmetic to simplify verification - */ - r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5)); - r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); - r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4)); 
- r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); - r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3)); - } -} + /* Lift to signed canonical representative mod 2^16. */ + const int16_t t = cast_uint16_to_int16(a_inverted); -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a) -{ - unsigned j; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + int32_t r = a - ((int32_t)t * MLKEM_Q); + /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */ - for (j = 0; j < MLKEM_N / 8; j++) - __loop__(invariant(j <= MLKEM_N / 8)) - { - unsigned k; - uint16_t t[8]; - for (k = 0; k < 8; k++) - __loop__( - invariant(k <= 8) - invariant(forall(r, 0, k, t[r] < (1u << 11)))) - { - t[k] = scalar_compress_d11(a->coeffs[8 * j + k]); - } + /* + * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, + * implementation-defined for negative left argument. Here, + * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) + */ + r = r >> 16; + /* Bounds: |r >> 16| <= ceil(|r| / 2^16) + * <= ceil(|a| / 2^16 + MLKEM_Q / 2) + * <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 + * + * (Note that |a >> n| = ceil(|a| / 2^16) for negative a) + */ - /* - * Make all implicit truncation explicit. No data is being - * truncated for the LHS's since each t[i] is 11-bit in size. 
- */ - r[11 * j + 0] = (t[0] >> 0) & 0xFF; - r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF); - r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF); - r[11 * j + 3] = (t[2] >> 2) & 0xFF; - r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF); - r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF); - r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF); - r[11 * j + 7] = (t[5] >> 1) & 0xFF; - r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF); - r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF); - r[11 * j + 10] = (t[7] >> 3); - } + return (int16_t)r; } -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]) +/************************************************* + * Name: montgomery_reduce + * + * Description: Montgomery reduction + * + * Arguments: - int32_t a: input integer to be reduced + * Must be smaller than 2 * 2^12 * 2^15 in absolute value. + * + * Returns: integer congruent to a * R^-1 modulo q, + * smaller than 2 * q in absolute value. + **************************************************/ +static INLINE int16_t montgomery_reduce(int32_t a) +__contract__( + requires(a > -(2 * UINT12_LIMIT * 32768)) + requires(a < (2 * UINT12_LIMIT * 32768)) + ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q) +) { - unsigned i; - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) - { - unsigned j; - uint8_t t[8]; - const unsigned offset = i * 5; - /* - * Explicitly truncate to avoid warning about - * implicit truncation in CBMC and unwind loop for ease - * of proof. 
- */ - - /* - * Decompress 5 8-bit bytes (so 40 bits) into - * 8 5-bit values stored in t[] - */ - t[0] = 0x1F & (a[offset + 0] >> 0); - t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3)); - t[2] = 0x1F & (a[offset + 1] >> 2); - t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1)); - t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4)); - t[5] = 0x1F & (a[offset + 3] >> 1); - t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2)); - t[7] = 0x1F & (a[offset + 4] >> 3); - - /* and copy to the correct slice in r[] */ - for (j = 0; j < 8; j++) - __loop__( - invariant(j <= 8 && i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) - { - r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]); - } - } - - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + int16_t res; + debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768); + + res = montgomery_reduce_generic(a); + /* Bounds: + * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2 + * <= UINT12_LIMIT + (MLKEM_Q + 1) / 2 + * < 2 * MLKEM_Q */ + + debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q); + return res; } -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d11(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]) +#if !defined(MLKEM_USE_NATIVE_POLY_TOMONT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: fqmul + * + * Description: Montgomery multiplication modulo q=3329 + * + * Arguments: - int16_t a: first factor + * Can be any int16_t. + * - int16_t b: second factor. + * Must be signed canonical (abs value <(q+1)/2) + * + * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and + * smaller than q in absolute value. 
+ * + **************************************************/ +static INLINE int16_t fqmul(int16_t a, int16_t b) +__contract__( + requires(b > -HALF_Q) + requires(b < HALF_Q) + ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q) +) { - unsigned j; - for (j = 0; j < MLKEM_N / 8; j++) - __loop__( - invariant(j <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q))) - { - unsigned k; - uint16_t t[8]; - uint8_t const *base = &a[11 * j]; - t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8)); - t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5)); - t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) | - ((uint16_t)base[4] << 10)); - t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7)); - t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4)); - t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) | - ((uint16_t)base[8] << 9)); - t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6)); - t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3)); - - for (k = 0; k < 8; k++) - __loop__( - invariant(k <= 8) - invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q))) - { - r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]); - } - } + int16_t res; + debug_assert_abs_bound(&b, 1, HALF_Q); + + res = montgomery_reduce((int32_t)a * (int32_t)b); + /* Bounds: + * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2 + * <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2 + * < MLKEM_Q + */ - debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); + debug_assert_abs_bound(&res, 1, MLKEM_Q); + return res; } -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */ - -#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) +#endif /* !defined(MLKEM_USE_NATIVE_POLY_TOMONT) || \ + !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \ + !defined(MLKEM_USE_NATIVE_NTT) || \ + 
!defined(MLKEM_USE_NATIVE_INTT) */ + +#if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || !defined(MLKEM_USE_NATIVE_INTT) +/************************************************* + * Name: barrett_reduce + * + * Description: Barrett reduction; given a 16-bit integer a, computes + * centered representative congruent to a mod q in + * {-(q-1)/2,...,(q-1)/2} + * + * Arguments: - int16_t a: input integer to be reduced + * + * Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. + **************************************************/ +static INLINE int16_t barrett_reduce(int16_t a) +__contract__( + ensures(return_value > -HALF_Q && return_value < HALF_Q) +) { - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_N / 2; i++) - __loop__(invariant(i <= MLKEM_N / 2)) - { - const uint16_t t0 = a->coeffs[2 * i]; - const uint16_t t1 = a->coeffs[2 * i + 1]; - /* - * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of - * significant data, so these can be packed into 24 bits or exactly - * 3 bytes, as follows. - */ - - /* Least significant bits 0 - 7 of t0. */ - r[3 * i + 0] = t0 & 0xFF; - - /* - * Most significant bits 8 - 11 of t0 become the least significant - * nibble of the second byte. The least significant 4 bits - * of t1 become the upper nibble of the second byte. - */ - r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0); + /* + * To divide by MLKEM_Q using Barrett multiplication, the "magic number" + * multiplier is round_to_nearest(2**26/MLKEM_Q) + */ + const int BPOWER = 26; + const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q; - /* Bits 4 - 11 of t1 become the third byte. 
*/ - r[3 * i + 2] = t1 >> 4; - } -} -#else /* MLKEM_USE_NATIVE_POLY_TOBYTES */ -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) -{ - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); - poly_tobytes_native(r, a); -} -#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */ + /* + * Compute round_to_nearest(a/MLKEM_Q) using the multiplier + * above and shift by BPOWER places. + * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, + * implementation-defined for negative left argument. Here, + * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) + */ + const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER; -#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES) -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -{ - unsigned i; - for (i = 0; i < MLKEM_N / 2; i++) - __loop__( - invariant(i <= MLKEM_N / 2) - invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT))) - { - const uint8_t t0 = a[3 * i + 0]; - const uint8_t t1 = a[3 * i + 1]; - const uint8_t t2 = a[3 * i + 2]; - r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF); - r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4); - } + /* + * t is in -10 .. 
+10, so we need 32-bit math to + * evaluate t * MLKEM_Q and the subsequent subtraction + */ + int16_t res = (int16_t)(a - t * MLKEM_Q); - /* Note that the coefficients are not canonical */ - debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT); -} -#else /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -{ - poly_frombytes_native(r, a); + debug_assert_abs_bound(&res, 1, HALF_Q); + return res; } -#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */ - -MLKEM_NATIVE_INTERNAL_API -void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) +#endif /* !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || \ + !defined(MLKEM_USE_NATIVE_INTT) */ + +static void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], + int16_t b_cached) +__contract__( + requires(memory_no_alias(r, 2 * sizeof(int16_t))) + requires(memory_no_alias(a, 2 * sizeof(int16_t))) + requires(memory_no_alias(b, 2 * sizeof(int16_t))) + requires(array_bound(a, 0, 2, 0, UINT12_LIMIT)) + assigns(memory_slice(r, 2 * sizeof(int16_t))) + ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))) { - unsigned i; -#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8) -#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!" 
-#endif + int32_t t0, t1; + debug_assert_bound(a, 2, 0, UINT12_LIMIT); - for (i = 0; i < MLKEM_N / 8; i++) - __loop__( - invariant(i <= MLKEM_N / 8) - invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q))) - { - unsigned j; - for (j = 0; j < 8; j++) - __loop__( - invariant(i < MLKEM_N / 8 && j <= 8) - invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q))) - { - /* Prevent the compiler from recognizing this as a bit selection */ - uint8_t mask = value_barrier_u8(1u << j); - r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask); - } - } - debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); -} + t0 = (int32_t)a[1] * b_cached; + t0 += (int32_t)a[0] * b[0]; + t1 = (int32_t)a[0] * b[1]; + t1 += (int32_t)a[1] * b[0]; -MLKEM_NATIVE_INTERNAL_API -void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a) -{ - unsigned i; - debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q); + /* |ti| < 2 * q * 2^15 */ + r[0] = montgomery_reduce(t0); + r[1] = montgomery_reduce(t1); - for (i = 0; i < MLKEM_N / 8; i++) - __loop__(invariant(i <= MLKEM_N / 8)) - { - unsigned j; - msg[i] = 0; - for (j = 0; j < 8; j++) - __loop__( - invariant(i <= MLKEM_N / 8 && j <= 8)) - { - uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]); - msg[i] |= t << j; - } - } + debug_assert_abs_bound(r, 2, 2 * MLKEM_Q); } MLKEM_NATIVE_INTERNAL_API @@ -434,12 +292,46 @@ void poly_tomont(poly *r) MLKEM_NATIVE_INTERNAL_API void poly_tomont(poly *r) { - poly_tomont_native(r); + poly_tomont_native(r->coeffs); debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); } #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */ #if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) +/************************************************************ + * Name: scalar_signed_to_unsigned_q + * + * Description: converts signed polynomial coefficient + * from signed (-3328 .. 3328) form to + * unsigned form (0 .. 3328). 
+ * + * Note: Cryptographic constant time implementation + * + * Examples: 0 -> 0 + * 1 -> 1 + * 3328 -> 3328 + * -1 -> 3328 + * -2 -> 3327 + * -3328 -> 1 + * + * Arguments: c: signed coefficient to be converted + ************************************************************/ +static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c) +__contract__( + requires(c > -MLKEM_Q && c < MLKEM_Q) + ensures(return_value >= 0 && return_value < MLKEM_Q) + ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q))) +{ + debug_assert_abs_bound(&c, 1, MLKEM_Q); + + /* Add Q if c is negative, but in constant time */ + c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c)); + + /* and therefore cast to uint16_t is safe. */ + debug_assert_bound(&c, 1, 0, MLKEM_Q); + return (uint16_t)c; +} + MLKEM_NATIVE_INTERNAL_API void poly_reduce(poly *r) { @@ -461,7 +353,7 @@ void poly_reduce(poly *r) MLKEM_NATIVE_INTERNAL_API void poly_reduce(poly *r) { - poly_reduce_native(r); + poly_reduce_native(r->coeffs); debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); } #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */ @@ -520,13 +412,232 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a) MLKEM_NATIVE_INTERNAL_API void poly_mulcache_compute(poly_mulcache *x, const poly *a) { - poly_mulcache_compute_native(x, a); + poly_mulcache_compute_native(x->coeffs, a->coeffs); /* Omitting bounds assertion since native implementations may * decide not to use a mulcache. Note that the C backend implementation * of poly_basemul_montgomery_cached() does still include the check. */ } #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */ +#if !defined(MLKEM_USE_NATIVE_NTT) +/* + * Computes a block of CT butterflies with a fixed twiddle factor, + * using Montgomery multiplication. + * Parameters: + * - r: Pointer to base of polynomial (_not_ the base of butterfly block) + * - zeta: Twiddle factor to use for the butterfly. This must be in + * Montgomery form and signed canonical.
+ * - start: Offset to the beginning of the butterfly block + * - len: Index difference between coefficients subject to a butterfly + * - bound: Ghost variable describing coefficient bound: Prior to `start`, + * coefficients must be bound by `bound + MLKEM_Q`. Post `start`, + * they must be bound by `bound`. + * When this function returns, output coefficients in the index range + * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`. + * Example: + * - start=8, len=4 + * This would compute the following four butterflies + * 8 -- 12 + * 9 -- 13 + * 10 -- 14 + * 11 -- 15 + * - start=4, len=2 + * This would compute the following two butterflies + * 4 -- 6 + * 5 -- 7 + */ +static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta, + unsigned start, unsigned len, int bound) +__contract__( + requires(start < MLKEM_N) + requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N) + requires(0 <= bound && bound < INT16_MAX - MLKEM_Q) + requires(-HALF_Q < zeta && zeta < HALF_Q) + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(array_abs_bound(r, 0, start, bound + MLKEM_Q)) + requires(array_abs_bound(r, start, MLKEM_N, bound)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q)) + ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound))) +{ + /* `bound` is a ghost variable only needed in the CBMC specification */ + unsigned j; + ((void)bound); + for (j = start; j < start + len; j++) + __loop__( + invariant(start <= j && j <= start + len) + /* + * Coefficients are updated in strided pairs, so the bounds for the + * intermediate states alternate twice between the old and new bound + */ + invariant(array_abs_bound(r, 0, j, bound + MLKEM_Q)) + invariant(array_abs_bound(r, j, start + len, bound)) + invariant(array_abs_bound(r, start + len, j + len, bound + MLKEM_Q)) + invariant(array_abs_bound(r, j + len, MLKEM_N, bound))) + { + int16_t t; + t = fqmul(r[j + len], 
zeta); + r[j + len] = r[j] - t; + r[j] = r[j] + t; + } +} + +/* + * Compute one layer of forward NTT + * Parameters: + * - r: Pointer to base of polynomial + * - len: Stride of butterflies in this layer. + * - layer: Ghost variable indicating which layer is being applied. + * Must match `len` via `len == MLKEM_N >> layer`. + * Note: `len` could be dropped and computed in the function, but + * we are following the structure of the reference NTT from the + * official Kyber implementation here, merely adding `layer` as + * a ghost variable for the specifications. + */ +static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer) +__contract__( + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer)) + requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q))) +{ + unsigned start, k; + /* `layer` is a ghost variable only needed in the CBMC specification */ + ((void)layer); + /* Twiddle factors for layer `layer` start at index 2^(layer-1) */ + k = MLKEM_N / (2 * len); + for (start = 0; start < MLKEM_N; start += 2 * len) + __loop__( + invariant(start < MLKEM_N + 2 * len) + invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N) + invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q)) + invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q))) + { + int16_t zeta = zetas[k++]; + ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q); + } +} + +/* + * Compute full forward NTT + * NOTE: This particular implementation satisfies a much tighter + * bound on the output coefficients (5*q) than the contractual one (8*q), + * but this is not needed in the calling code. Should we change the + * base multiplication strategy to require smaller NTT output bounds, + * the proof may need strengthening.
+ */ + +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *p) +{ + unsigned len, layer; + int16_t *r; + debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); + r = p->coeffs; + + for (len = 128, layer = 1; len >= 2; len >>= 1, layer++) + __loop__( + invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer)) + invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))) + { + ntt_layer(r, len, layer); + } + + /* Check the stronger bound */ + debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); +} +#else /* MLKEM_USE_NATIVE_NTT */ + +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *p) +{ + debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q); + ntt_native(p->coeffs); + debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND); +} +#endif /* MLKEM_USE_NATIVE_NTT */ + +#if !defined(MLKEM_USE_NATIVE_INTT) + +/* Compute one layer of inverse NTT */ +static void invntt_layer(int16_t *r, unsigned len, unsigned layer) +__contract__( + requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N)) + requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7) + requires(len == (1 << (8 - layer))) + requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N)) + ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) +{ + unsigned start, k; + /* `layer` is a ghost variable used only in the specification */ + ((void)layer); + k = MLKEM_N / len - 1; + for (start = 0; start < MLKEM_N; start += 2 * len) + __loop__( + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)) + invariant(start <= MLKEM_N && k <= 127) + /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */ + invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len)) + { + unsigned j; + int16_t zeta = zetas[k--]; + for (j = start; j < start + len; j++) + __loop__( + invariant(start <= j && j <= start + len) + invariant(start <= MLKEM_N && k <= 127) + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) + { + int16_t t = r[j]; + r[j] = barrett_reduce(t + r[j + len]); + r[j + len] = r[j + len] - t; + r[j + len] = fqmul(r[j 
+ len], zeta); + } + } +} + +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *p) +{ + /* + * Scale input polynomial to account for Montgomery factor + * and NTT twist. This also brings coefficients down to + * absolute value < MLKEM_Q. + */ + unsigned j, len, layer; + const int16_t f = 1441; + int16_t *r = p->coeffs; + + for (j = 0; j < MLKEM_N; j++) + __loop__( + invariant(j <= MLKEM_N) + invariant(array_abs_bound(r, 0, j, MLKEM_Q))) + { + r[j] = fqmul(r[j], f); + } + + /* Run the invNTT layers */ + for (len = 2, layer = 7; len <= 128; len <<= 1, layer--) + __loop__( + invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer))) + invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))) + { + invntt_layer(p->coeffs, len, layer); + } + + debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); +} +#else /* MLKEM_USE_NATIVE_INTT */ + +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *p) +{ + intt_native(p->coeffs); + debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND); +} +#endif /* MLKEM_USE_NATIVE_INTT */ + #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ #define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.h index 6a14c785d..cb0d67c1a 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.h @@ -9,7 +9,7 @@ #include #include "cbmc.h" #include "common.h" -#include "reduce.h" +#include "debug.h" #include "verify.h" /* Absolute exclusive upper bound for the output of the inverse NTT */ @@ -18,6 +18,9 @@ /* Absolute exclusive upper bound for the output of the forward NTT */ #define NTT_BOUND (8 * MLKEM_Q) +#define zetas MLKEM_NAMESPACE(zetas) +extern const int16_t zetas[128]; + /* * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] @@ -38,520 +41,6 @@ typedef struct int16_t coeffs[MLKEM_N >> 1]; } poly_mulcache; -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1) -#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4) -#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5) -#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10) -#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11) -#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4) -#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5) -#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10) -#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11) -#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q) -/* End of static namespacing */ - -/************************************************************ - * Name: scalar_compress_d1 - * - * Description: Computes round(u * 2 / q) - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 1. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. 
- */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d1(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 2) - ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) ) -{ - uint32_t d0 = u << 1; - d0 *= 645083; - d0 += 1u << 30; - d0 >>= 31; - return d0; -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_compress_d4 - * - * Description: Computes round(u * 16 / q) % 16 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 4. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d4(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 16) - ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16)) -{ - uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */ - return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */ -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d4 - * - * Description: Computes round(u * q / 16) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 4. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d4(uint32_t u) -__contract__( - requires(0 <= u && u < 16) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 8) / 16; } - -/************************************************************ - * Name: scalar_compress_d5 - * - * Description: Computes round(u * 32 / q) % 32 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 5. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d5(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < 32) - ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) ) -{ - uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */ - return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */ -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d5 - * - * Description: Computes round(u * q / 32) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 5. - * - * Arguments: - u: Unsigned canonical modulus modulo 32 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d5(uint32_t u) -__contract__( - requires(0 <= u && u < 32) - ensures(return_value <= MLKEM_Q - 1) -) { return ((u * MLKEM_Q) + 16) / 32; } - -/************************************************************ - * Name: scalar_compress_d10 - * - * Description: Computes round(u * 2**10 / q) % 2**10 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d10(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < (1u << 10)) - ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10))) -{ - uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^32 / MLKEM_Q) */ - d0 = (d0 + ((uint64_t)1u << 32)) >> 33; - return (d0 & 0x3FF); -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d10 - * - * Description: Computes round(u * q / 1024) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d10(uint32_t u) -__contract__( - requires(0 <= u && u < 1024) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 512) / 1024; } - -/************************************************************ - * Name: scalar_compress_d11 - * - * Description: Computes round(u * 2**11 / q) % 2**11 - * - * Implements Compress_d from FIPS203, Eq (4.7), - * for d = 11. - * - * Arguments: - u: Unsigned canonical modulus modulo q - * to be compressed. - ************************************************************/ -/* - * The multiplication in this routine will exceed UINT32_MAX - * and wrap around for large values of u. This is expected and required. - */ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "unsigned-overflow" -#endif -static INLINE uint32_t scalar_compress_d11(uint16_t u) -__contract__( - requires(u <= MLKEM_Q - 1) - ensures(return_value < (1u << 11)) - ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11))) -{ - uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */ - d0 = (d0 + ((uint64_t)1u << 32)) >> 33; - return (d0 & 0x7FF); -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************************ - * Name: scalar_decompress_d11 - * - * Description: Computes round(u * q / 1024) - * - * Implements Decompress_d from FIPS203, Eq (4.8), - * for d = 10. - * - * Arguments: - u: Unsigned canonical modulus modulo 16 - * to be decompressed. 
- ************************************************************/ -static INLINE uint16_t scalar_decompress_d11(uint32_t u) -__contract__( - requires(0 <= u && u < 2048) - ensures(return_value <= (MLKEM_Q - 1)) -) { return ((u * MLKEM_Q) + 1024) / 2048; } - -/************************************************************ - * Name: scalar_signed_to_unsigned_q - * - * Description: converts signed polynomial coefficient - * from signed (-3328 .. 3328) form to - * unsigned form (0 .. 3328). - * - * Note: Cryptographic constant time implementation - * - * Examples: 0 -> 0 - * 1 -> 1 - * 3328 -> 3328 - * -1 -> 3328 - * -2 -> 3327 - * -3328 -> 1 - * - * Arguments: c: signed coefficient to be converted - ************************************************************/ -static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c) -__contract__( - requires(c > -MLKEM_Q && c < MLKEM_Q) - ensures(return_value >= 0 && return_value < MLKEM_Q) - ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q))) -{ - debug_assert_abs_bound(&c, 1, MLKEM_Q); - - /* Add Q if c is negative, but in constant time */ - c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c)); - - /* and therefore cast to uint16_t is safe. */ - debug_assert_bound(&c, 1, 0, MLKEM_Q); - return (uint16_t)c; -} - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \ - (MLKEM_K == 2 || MLKEM_K == 3) -#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4) -/************************************************* - * Name: poly_compress_d4 - * - * Description: Compression (4 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a); - -#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10) -/************************************************* - * Name: poly_compress_d10 - * - * Description: Compression (10 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a); - -#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4) -/************************************************* - * Name: poly_decompress_d4 - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]); - -#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10) -/************************************************* - * Name: poly_decompress_d10 - * - * Description: De-serialization and subsequent decompression (10 bits) of a - * polynomial; approximate inverse of poly_compress_d10 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d10(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]); -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \ - || MLKEM_K == 3) */ - -#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 -#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5) -/************************************************* - * Name: poly_compress_d5 - * - * Description: Compression (5 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a); - -#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11) -/************************************************* - * Name: poly_compress_d11 - * - * Description: Compression (11 bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a); - -#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5) -/************************************************* - * Name: poly_decompress_d5 - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]); - -#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11) -/************************************************* - * Name: poly_decompress_d11 - * - * Description: De-serialization and subsequent decompression (11 bits) of a - * polynomial; approximate inverse of poly_compress_d11 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_decompress_d11(poly *r, - const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]); -#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \ - */ - -#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes) -/************************************************* - * Name: poly_tobytes - * - * Description: Serialization of a polynomial. - * Signed coefficients are converted to - * unsigned form before serialization. 
- * - * Arguments: INPUT: - * - a: const pointer to input polynomial, - * with each coefficient in the range [0,1,..,Q-1] - * OUTPUT - * - r: pointer to output byte array - * (of MLKEM_POLYBYTES bytes) - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYBYTES)) - requires(memory_no_alias(a, sizeof(poly))) - requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(r)) -); - - -#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes) -/************************************************* - * Name: poly_frombytes - * - * Description: De-serialization of a polynomial. - * - * Arguments: INPUT - * - a: pointer to input byte array - * (of MLKEM_POLYBYTES bytes) - * OUTPUT - * - r: pointer to output polynomial, with - * each coefficient unsigned and in the range - * 0 .. 4095 - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)) -); - - -#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg) -/************************************************* - * Name: poly_frommsg - * - * Description: Convert 32-byte message to polynomial - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *msg: pointer to input message - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES]) -__contract__( - requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(object_whole(r)) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) -); - -#define poly_tomsg 
MLKEM_NAMESPACE(poly_tomsg) -/************************************************* - * Name: poly_tomsg - * - * Description: Convert polynomial to 32-byte message - * - * Arguments: - uint8_t *msg: pointer to output message - * - const poly *r: pointer to input polynomial - * Coefficients must be unsigned canonical - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r) -__contract__( - requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES)) - requires(memory_no_alias(r, sizeof(poly))) - requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(msg)) -); - #define poly_basemul_montgomery_cached \ MLKEM_NAMESPACE(poly_basemul_montgomery_cached) /************************************************* @@ -715,4 +204,56 @@ __contract__( assigns(object_whole(r)) ); +#define poly_ntt MLKEM_NAMESPACE(poly_ntt) +/************************************************* + * Name: poly_ntt + * + * Description: Computes negacyclic number-theoretic transform (NTT) of + * a polynomial in place. + * + * The input is assumed to be in normal order and + * coefficient-wise bound by MLKEM_Q in absolute value. + * + * The output polynomial is in bitreversed order, and + * coefficient-wise bound by NTT_BOUND in absolute value. + * + * (NOTE: Sometimes the input to the NTT is actually smaller, + * which gives better bounds.) 
+ * + * Arguments: - poly *r: pointer to in/output polynomial + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_ntt(poly *r) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q)) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND)) +); + +#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont) +/************************************************* + * Name: poly_invntt_tomont + * + * Description: Computes inverse of negacyclic number-theoretic transform (NTT) + * of a polynomial in place; + * inputs assumed to be in bitreversed order, output in normal + * order + * + * The input is assumed to be in bitreversed order, and can + * have arbitrary coefficients in int16_t. + * + * The output polynomial is in normal order, and + * coefficient-wise bound by INVNTT_BOUND in absolute value. + * + * Arguments: - poly *r: pointer to in/output polynomial + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_invntt_tomont(poly *r) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND)) +); + #endif /* POLY_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly_k.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly_k.c new file mode 100644 index 000000000..c2d330ea9 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly_k.c @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include "poly_k.h" +#include +#include +#include "arith_backend.h" +#include "compress.h" +#include "sampling.h" +#include "symmetric.h" + +#include "debug.h" + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g.
with varying security levels) + * within a single compilation unit. */ +#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1) +#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2) +/* End of static namespacing */ + +MLKEM_NATIVE_INTERNAL_API +void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU], + const polyvec *a) +{ + unsigned i; + debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_K; i++) + { + poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]); + } +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_decompress_du(polyvec *r, + const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU]) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU); + } + + debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a) +{ + unsigned i; + debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q); + + for (i = 0; i < MLKEM_K; i++) + { + poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]); + } +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES]) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES); + } + + debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_ntt(polyvec *r) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_ntt(&r->vec[i]); + } + + debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_invntt_tomont(polyvec *r) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_invntt_tomont(&r->vec[i]); + } + + debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND); +} + +#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, + 
const polyvec *b, + const polyvec_mulcache *b_cache) +{ + unsigned i; + poly t; + debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); + + poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]); + for (i = 1; i < MLKEM_K; i++) + { + poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i], + &b_cache->vec[i]); + poly_add(r, &t); + } + + /* + * This bound is true for the C implementation, but not needed + * in the higher level bounds reasoning. It is thus omitted + * from the spec to not unnecessarily constrain native + * implementations, but checked here nonetheless. + */ + debug_assert_abs_bound(r, MLKEM_N, MLKEM_K * 2 * MLKEM_Q); +} +#else /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, + const polyvec *b, + const polyvec_mulcache *b_cache) +{ + debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); + /* Omitting bounds assertion for cache since native implementations may + * decide not to use a mulcache. Note that the C backend implementation + * of poly_basemul_montgomery_cached() does still include the check. 
*/ + polyvec_basemul_acc_montgomery_cached_native(r->coeffs, (const int16_t *)a, + (const int16_t *)b, + (const int16_t *)b_cache); +} +#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ + +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) +{ + polyvec_mulcache b_cache; + polyvec_mulcache_compute(&b_cache, b); + polyvec_basemul_acc_montgomery_cached(r, a, b, &b_cache); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_mulcache_compute(&x->vec[i], &a->vec[i]); + } +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_reduce(polyvec *r) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_reduce(&r->vec[i]); + } + + debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q); +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_add(polyvec *r, const polyvec *b) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_add(&r->vec[i], &b->vec[i]); + } +} + +MLKEM_NATIVE_INTERNAL_API +void polyvec_tomont(polyvec *r) +{ + unsigned i; + for (i = 0; i < MLKEM_K; i++) + { + poly_tomont(&r->vec[i]); + } + + debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q); +} + + +/************************************************* + * Name: poly_cbd_eta1 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter MLKEM_ETA1. 
+ * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +static INLINE void poly_cbd_eta1(poly *r, + const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4]) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4)) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1)) +) +{ +#if MLKEM_ETA1 == 2 + poly_cbd2(r, buf); +#elif MLKEM_ETA1 == 3 + poly_cbd3(r, buf); +#else +#error "Invalid value of MLKEM_ETA1" +#endif +} + +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, + const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0, + uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) +{ + ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1]; + ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1]; + ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1]; + ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1]; + memcpy(extkey0, seed, MLKEM_SYMBYTES); + memcpy(extkey1, seed, MLKEM_SYMBYTES); + memcpy(extkey2, seed, MLKEM_SYMBYTES); + memcpy(extkey3, seed, MLKEM_SYMBYTES); + extkey0[MLKEM_SYMBYTES] = nonce0; + extkey1[MLKEM_SYMBYTES] = nonce1; + extkey2[MLKEM_SYMBYTES] = nonce2; + extkey3[MLKEM_SYMBYTES] = nonce3; + prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3); + poly_cbd_eta1(r0, buf0); + poly_cbd_eta1(r1, buf1); + poly_cbd_eta1(r2, buf2); + poly_cbd_eta1(r3, buf3); + + debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1); +} + +#if MLKEM_K == 2 || MLKEM_K == 4 
+/************************************************* + * Name: poly_cbd_eta2 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter MLKEM_ETA2. + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +static INLINE void poly_cbd_eta2(poly *r, + const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4)) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))) +{ +#if MLKEM_ETA2 == 2 + poly_cbd2(r, buf); +#else +#error "Invalid value of MLKEM_ETA2" +#endif +} + +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES], + uint8_t nonce) +{ + ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]; + ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1]; + + memcpy(extkey, seed, MLKEM_SYMBYTES); + extkey[MLKEM_SYMBYTES] = nonce; + prf_eta2(buf, extkey); + + poly_cbd_eta2(r, buf); + + debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA1 + 1); +} +#endif /* MLKEM_K == 2 || MLKEM_K == 4 */ + + +#if MLKEM_K == 2 +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3, + const uint8_t seed[MLKEM_SYMBYTES], + uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, + uint8_t nonce3) +{ + ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4]; + ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4]; + ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1]; + memcpy(extkey[0], seed, MLKEM_SYMBYTES); + memcpy(extkey[1], seed, MLKEM_SYMBYTES); + memcpy(extkey[2], seed, MLKEM_SYMBYTES); + memcpy(extkey[3], seed, MLKEM_SYMBYTES); + extkey[0][MLKEM_SYMBYTES] = nonce0; + extkey[1][MLKEM_SYMBYTES] = nonce1; + extkey[2][MLKEM_SYMBYTES] = nonce2; + 
extkey[3][MLKEM_SYMBYTES] = nonce3; + + prf_eta1(buf1[0], extkey[0]); + prf_eta1(buf1[1], extkey[1]); + prf_eta2(buf2[0], extkey[2]); + prf_eta2(buf2[1], extkey[3]); + + poly_cbd_eta1(r0, buf1[0]); + poly_cbd_eta1(r1, buf1[1]); + poly_cbd_eta2(r2, buf2[0]); + poly_cbd_eta2(r3, buf2[1]); + + debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1); + debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1); + debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1); +} +#endif /* MLKEM_K == 2 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly_k.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly_k.h new file mode 100644 index 000000000..0aea95912 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly_k.h @@ -0,0 +1,596 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef POLY_K_H +#define POLY_K_H + +#include <stdint.h> +#include "common.h" +#include "compress.h" +#include "poly.h" + +#define polyvec MLKEM_NAMESPACE_K(polyvec) +typedef struct +{ + poly vec[MLKEM_K]; +} ALIGN polyvec; + +#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache) +typedef struct +{ + poly_mulcache vec[MLKEM_K]; +} polyvec_mulcache; + +#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du) +/************************************************* + * Name: poly_compress_du + * + * Description: Compression (du bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. 
+ **************************************************/ +static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], + const poly *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU)) + requires(memory_no_alias(a, sizeof(poly))) + requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU))) +{ +#if MLKEM_DU == 10 + poly_compress_d10(r, a); +#elif MLKEM_DU == 11 + poly_compress_d11(r, a); +#else +#error "Invalid value of MLKEM_DU" +#endif +} + +#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du) +/************************************************* + * Name: poly_decompress_du + * + * Description: De-serialization and subsequent decompression (du bits) of a + * polynomial; approximate inverse of poly_compress_du + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). 
+ * + **************************************************/ +static INLINE void poly_decompress_du( + poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(memory_slice(r, sizeof(poly))) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) +{ +#if MLKEM_DU == 10 + poly_decompress_d10(r, a); +#elif MLKEM_DU == 11 + poly_decompress_d11(r, a); +#else +#error "Invalid value of MLKEM_DU" +#endif +} + +#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv) +/************************************************* + * Name: poly_compress_dv + * + * Description: Compression (dv bits) and subsequent serialization of a + * polynomial + * + * Arguments: - uint8_t *r: pointer to output byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes) + * - const poly *a: pointer to input polynomial + * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. 
+ **************************************************/ +static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], + const poly *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV)) + requires(memory_no_alias(a, sizeof(poly))) + requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + assigns(object_whole(r))) +{ +#if MLKEM_DV == 4 + poly_compress_d4(r, a); +#elif MLKEM_DV == 5 + poly_compress_d5(r, a); +#else +#error "Invalid value of MLKEM_DV" +#endif +} + + +#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv) +/************************************************* + * Name: poly_decompress_dv + * + * Description: De-serialization and subsequent decompression (dv bits) of a + * polynomial; approximate inverse of poly_compress_dv + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes) + * + * Upon return, the coefficients of the output polynomial are unsigned-canonical + * (non-negative and smaller than MLKEM_Q). + * + **************************************************/ +static INLINE void poly_decompress_dv( + poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV)) + requires(memory_no_alias(r, sizeof(poly))) + assigns(object_whole(r)) + ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) +{ +#if MLKEM_DV == 4 + poly_decompress_d4(r, a); +#elif MLKEM_DV == 5 + poly_decompress_d5(r, a); +#else +#error "Invalid value of MLKEM_DV" +#endif +} + +#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du) +/************************************************* + * Name: polyvec_compress_du + * + * Description: Compress and serialize vector of polynomials + * + * Arguments: - uint8_t *r: pointer to output byte array + * (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU) + * - const polyvec *a: pointer to input vector of polynomials. 
+ * Coefficients must be unsigned canonical, + * i.e. in [0,1,..,MLKEM_Q-1]. + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU], + const polyvec *a) +__contract__( + requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU)) + requires(memory_no_alias(a, sizeof(polyvec))) + requires(forall(k0, 0, MLKEM_K, + array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) + assigns(object_whole(r)) +); + +#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du) +/************************************************* + * Name: polyvec_decompress_du + * + * Description: De-serialize and decompress vector of polynomials; + * approximate inverse of polyvec_compress_du + * + * Arguments: - polyvec *r: pointer to output vector of polynomials. + * Output will have coefficients normalized to [0,..,q-1]. + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU) + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_decompress_du(polyvec *r, + const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU]) +__contract__( + requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU)) + requires(memory_no_alias(r, sizeof(polyvec))) + assigns(object_whole(r)) + ensures(forall(k0, 0, MLKEM_K, + array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) +); + +#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes) +/************************************************* + * Name: polyvec_tobytes + * + * Description: Serialize vector of polynomials + * + * Arguments: - uint8_t *r: pointer to output byte array + * (needs space for MLKEM_POLYVECBYTES) + * - const polyvec *a: pointer to input vector of polynomials + * Each polynomial must have coefficients in [0,..,q-1]. 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a) +__contract__( + requires(memory_no_alias(a, sizeof(polyvec))) + requires(memory_no_alias(r, MLKEM_POLYVECBYTES)) + requires(forall(k0, 0, MLKEM_K, + array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) + assigns(object_whole(r)) +); + +#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes) +/************************************************* + * Name: polyvec_frombytes + * + * Description: De-serialize vector of polynomials; + * inverse of polyvec_tobytes + * + * Arguments: - polyvec *r: pointer to output vector of polynomials. + * Output coefficients are normalized to [0..4095]. + * - const uint8_t *a: pointer to input byte array + * (of length MLKEM_POLYVECBYTES) + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES]) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + requires(memory_no_alias(a, MLKEM_POLYVECBYTES)) + assigns(object_whole(r)) + ensures(forall(k0, 0, MLKEM_K, + array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) +); + +#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt) +/************************************************* + * Name: polyvec_ntt + * + * Description: Apply forward NTT to all elements of a vector of polynomials. + * + * The input is assumed to be in normal order and + * coefficient-wise bound by MLKEM_Q in absolute value. + * + * The output polynomial is in bitreversed order, and + * coefficient-wise bound by NTT_BOUND in absolute value. 
+ * + * Arguments: - polyvec *r: pointer to in/output vector of polynomials + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_ntt(polyvec *r) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + requires(forall(j, 0, MLKEM_K, + array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q))) + assigns(object_whole(r)) + ensures(forall(j, 0, MLKEM_K, + array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND))) +); + +#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont) +/************************************************* + * Name: polyvec_invntt_tomont + * + * Description: Apply inverse NTT to all elements of a vector of polynomials + * and multiply by Montgomery factor 2^16 + * + * The input is assumed to be in bitreversed order, and can + * have arbitrary coefficients in int16_t. + * + * The output polynomial is in normal order, and + * coefficient-wise bound by INVNTT_BOUND in absolute value. + * + * + * Arguments: - polyvec *r: pointer to in/output vector of polynomials + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_invntt_tomont(polyvec *r) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + assigns(object_whole(r)) + ensures(forall(j, 0, MLKEM_K, + array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, INVNTT_BOUND))) +); + +#define polyvec_basemul_acc_montgomery \ + MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery) +/************************************************* + * Name: polyvec_basemul_acc_montgomery + * + * Description: Multiply elements of a and b in NTT domain, accumulate into r, + * and multiply by 2^-16. 
+ * + * Arguments: - poly *r: pointer to output polynomial + * - const polyvec *a: pointer to first input vector of polynomials + * - const polyvec *b: pointer to second input vector of polynomials + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(a, sizeof(polyvec))) + requires(memory_no_alias(b, sizeof(polyvec))) + requires(forall(k1, 0, MLKEM_K, + array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) + assigns(memory_slice(r, sizeof(poly))) +); + + +#define polyvec_basemul_acc_montgomery_cached \ + MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached) +/************************************************* + * Name: polyvec_basemul_acc_montgomery_cached + * + * Description: Scalar product of two vectors of polynomials in NTT domain, + * using mulcache for second operand. + * + * Bounds: + * - Every coefficient of a is assumed to be in [0..4095] + * - No bounds guarantees for the coefficients in the result. + * + * Arguments: - poly *r: pointer to output polynomial + * - const polyvec *a: pointer to first input polynomial vector + * - const polyvec *b: pointer to second input polynomial vector + * - const polyvec_mulcache *b_cache: pointer to mulcache + * for second input polynomial vector. Can be computed + * via polyvec_mulcache_compute(). 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, + const polyvec *b, + const polyvec_mulcache *b_cache) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(a, sizeof(polyvec))) + requires(memory_no_alias(b, sizeof(polyvec))) + requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache))) + requires(forall(k1, 0, MLKEM_K, + array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) + assigns(memory_slice(r, sizeof(poly))) +); + +#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute) +/************************************************************ + * Name: polyvec_mulcache_compute + * + * Description: Computes the mulcache for a vector of polynomials in NTT domain + * + * The mulcache of a degree-2 polynomial b := b0 + b1*X + * in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when + * computing products of b in Fq[X]/(X^2-zeta). + * + * The mulcache of a polynomial in NTT domain -- which is + * a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta), + * for varying zeta, is the 128-tuple of mulcaches of those + * polynomials. + * + * The mulcache of a vector of polynomials is the vector + * of mulcaches of its entries. + * + * Arguments: - x: Pointer to mulcache to be populated + * - a: Pointer to input polynomial vector + ************************************************************/ +/* + * NOTE: The default C implementation of this function populates + * the mulcache with values in (-q,q), but this is not needed for the + * higher level safety proofs, and thus not part of the spec. 
+ */ +MLKEM_NATIVE_INTERNAL_API +void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a) +__contract__( + requires(memory_no_alias(x, sizeof(polyvec_mulcache))) + requires(memory_no_alias(a, sizeof(polyvec))) + assigns(object_whole(x)) +); + +#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce) +/************************************************* + * Name: polyvec_reduce + * + * Description: Applies Barrett reduction to each coefficient + * of each element of a vector of polynomials; + * for details of the Barrett reduction see comments in reduce.c + * + * Arguments: - polyvec *r: pointer to input/output polynomial + **************************************************/ +/* + * NOTE: The semantics of polyvec_reduce() is different in + * the reference implementation, which requires + * signed canonical output data. Unsigned canonical + * outputs are better suited to the only remaining + * use of poly_reduce() in the context of (de)serialization. + */ +MLKEM_NATIVE_INTERNAL_API +void polyvec_reduce(polyvec *r) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + assigns(object_whole(r)) + ensures(forall(k0, 0, MLKEM_K, + array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) +); + +#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add) +/************************************************* + * Name: polyvec_add + * + * Description: Add vectors of polynomials + * + * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be + * added to + * - const polyvec *b: pointer to second input vector of polynomials + * + * The coefficients of r and b must be so that the addition does + * not overflow. Otherwise, the behaviour of this function is undefined. + * + * The coefficients returned in *r are in int16_t which is sufficient + * to prove type-safety of calling units. Therefore, no stronger + * ensures clause is required on this function. 
+ **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_add(polyvec *r, const polyvec *b) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + requires(memory_no_alias(b, sizeof(polyvec))) + requires(forall(j0, 0, MLKEM_K, + forall(k0, 0, MLKEM_N, + (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX))) + requires(forall(j1, 0, MLKEM_K, + forall(k1, 0, MLKEM_N, + (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN))) + assigns(object_whole(r)) +); + +#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont) +/************************************************* + * Name: polyvec_tomont + * + * Description: Inplace conversion of all coefficients of a polynomial + * vector from normal domain to Montgomery domain + * + * Bounds: Output < q in absolute value. + * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void polyvec_tomont(polyvec *r) +__contract__( + requires(memory_no_alias(r, sizeof(polyvec))) + assigns(memory_slice(r, sizeof(polyvec))) + assigns(object_whole(r)) + ensures(forall(j, 0, MLKEM_K, + array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q))) +); + +#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x) +/************************************************* + * Name: poly_getnoise_eta1_4x + * + * Description: Batch sample four polynomials deterministically from a seed + * and nonces, with output polynomials close to centered binomial distribution + * with parameter MLKEM_ETA1. 
+ * + * Arguments: - poly *r{0,1,2,3}: pointer to output polynomial + * - const uint8_t *seed: pointer to input seed + * (of length MLKEM_SYMBYTES bytes) + * - uint8_t nonce{0,1,2,3}: one-byte input nonce + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, + const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0, + uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) +/* Depending on MLKEM_K, the pointers passed to this function belong + to the same objects, so we cannot use memory_no_alias for r0-r3. + + NOTE: Somehow it is important to use memory_no_alias() first in the + conjunctions defining each case. +*/ +#if MLKEM_K == 2 +__contract__( + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */ + (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) && + r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2))) + assigns(memory_slice(r0, sizeof(poly))) + assigns(memory_slice(r1, sizeof(poly))) + assigns(memory_slice(r2, sizeof(poly))) + assigns(memory_slice(r3, sizeof(poly))) + ensures( + array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); +); +#elif MLKEM_K == 4 +__contract__( + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + requires( /* Case B: r0, r1, r2, r3 consecutive */ + (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3)) + assigns(memory_slice(r0, sizeof(poly))) + assigns(memory_slice(r1, sizeof(poly))) + assigns(memory_slice(r2, sizeof(poly))) + assigns(memory_slice(r3, sizeof(poly))) + ensures( + array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 
1) + && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); +); +#elif MLKEM_K == 3 +__contract__( + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + requires( /* Case C: r0, r1, r2 consecutive */ + (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) && + r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0))) + assigns(memory_slice(r0, sizeof(poly))) + assigns(memory_slice(r1, sizeof(poly))) + assigns(memory_slice(r2, sizeof(poly))) + assigns(memory_slice(r3, sizeof(poly))) + ensures( + array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); +); +#endif /* MLKEM_K */ + +#if MLKEM_ETA1 == MLKEM_ETA2 +/* + * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024 + * where MLKEM_ETA2 = MLKEM_ETA1 = 2. + * For ml-kem-512, poly_getnoise_eta1122_4x is used instead. + */ +#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x +#endif /* MLKEM_ETA1 == MLKEM_ETA2 */ + +#if MLKEM_K == 2 || MLKEM_K == 4 +#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2) +/************************************************* + * Name: poly_getnoise_eta2 + * + * Description: Sample a polynomial deterministically from a seed and a nonce, + * with output polynomial close to centered binomial distribution + * with parameter MLKEM_ETA2 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *seed: pointer to input seed + * (of length MLKEM_SYMBYTES bytes) + * - uint8_t nonce: one-byte input nonce + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES], + uint8_t nonce) +__contract__( + requires(memory_no_alias(r, sizeof(poly))) + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + assigns(object_whole(r)) + ensures(array_abs_bound(r->coeffs, 0, 
MLKEM_N, MLKEM_ETA2 + 1)) +); +#endif /* MLKEM_K == 2 || MLKEM_K == 4 */ + +#if MLKEM_K == 2 +#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x) +/************************************************* + * Name: poly_getnoise_eta1122_4x + * + * Description: Batch sample four polynomials deterministically from a seed + * and nonces, with output polynomials close to centered binomial + * distribution with parameter MLKEM_ETA1 (r0, r1) and MLKEM_ETA2 (r2, r3) + * + * Arguments: - poly *r{0,1,2,3}: pointer to output polynomial + * - const uint8_t *seed: pointer to input seed + * (of length MLKEM_SYMBYTES bytes) + * - uint8_t nonce{0,1,2,3}: one-byte input nonce + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3, + const uint8_t seed[MLKEM_SYMBYTES], + uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, + uint8_t nonce3) +__contract__( + requires( /* r0, r1 consecutive, r2, r3 consecutive */ + (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) && + r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2))) + requires(memory_no_alias(seed, MLKEM_SYMBYTES)) + assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3)) + ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) + && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1) + && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)); +); +#endif /* MLKEM_K == 2 */ + +#endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/polyvec.c deleted file mode 100644 index 50ea1c34a..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/polyvec.c +++ /dev/null @@ -1,330 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "polyvec.h" -#include -#include -#include 
"arith_backend.h" -#include "cbd.h" -#include "ntt.h" -#include "poly.h" -#include "symmetric.h" - -#include "debug.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1) -#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2) -/* End of static namespacing */ - -MLKEM_NATIVE_INTERNAL_API -void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU], - const polyvec *a) -{ - unsigned i; - debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_K; i++) - { - poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]); - } -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_decompress_du(polyvec *r, - const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU]) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU); - } - - debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a) -{ - unsigned i; - debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q); - - for (i = 0; i < MLKEM_K; i++) - { - poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]); - } -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES]) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES); - } - - debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_ntt(polyvec *r) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_ntt(&r->vec[i]); - } - - debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_invntt_tomont(polyvec *r) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_invntt_tomont(&r->vec[i]); - } - - 
debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND); -} - -#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, - const polyvec *b, - const polyvec_mulcache *b_cache) -{ - unsigned i; - poly t; - debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); - - poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]); - for (i = 1; i < MLKEM_K; i++) - { - poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i], - &b_cache->vec[i]); - poly_add(r, &t); - } - - /* - * This bound is true for the C implementation, but not needed - * in the higher level bounds reasoning. It is thus omitted - * them from the spec to not unnecessarily constrain native - * implementations, but checked here nonetheless. - */ - debug_assert_abs_bound(r, MLKEM_K, MLKEM_N * 2 * MLKEM_Q); -} -#else /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, - const polyvec *b, - const polyvec_mulcache *b_cache) -{ - debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT); - /* Omitting bounds assertion for cache since native implementations may - * decide not to use a mulcache. Note that the C backend implementation - * of poly_basemul_montgomery_cached() does still include the check. 
*/ - polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache); -} -#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ - -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) -{ - polyvec_mulcache b_cache; - polyvec_mulcache_compute(&b_cache, b); - polyvec_basemul_acc_montgomery_cached(r, a, b, &b_cache); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_mulcache_compute(&x->vec[i], &a->vec[i]); - } -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_reduce(polyvec *r) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_reduce(&r->vec[i]); - } - - debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q); -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_add(polyvec *r, const polyvec *b) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_add(&r->vec[i], &b->vec[i]); - } -} - -MLKEM_NATIVE_INTERNAL_API -void polyvec_tomont(polyvec *r) -{ - unsigned i; - for (i = 0; i < MLKEM_K; i++) - { - poly_tomont(&r->vec[i]); - } - - debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q); -} - - -/************************************************* - * Name: poly_cbd_eta1 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter MLKEM_ETA1. 
- * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -static INLINE void poly_cbd_eta1(poly *r, - const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4]) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4)) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1)) -) -{ -#if MLKEM_ETA1 == 2 - poly_cbd2(r, buf); -#elif MLKEM_ETA1 == 3 - poly_cbd3(r, buf); -#else -#error "Invalid value of MLKEM_ETA1" -#endif -} - -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0, - uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) -{ - ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1]; - ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1]; - ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1]; - ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1]; - memcpy(extkey0, seed, MLKEM_SYMBYTES); - memcpy(extkey1, seed, MLKEM_SYMBYTES); - memcpy(extkey2, seed, MLKEM_SYMBYTES); - memcpy(extkey3, seed, MLKEM_SYMBYTES); - extkey0[MLKEM_SYMBYTES] = nonce0; - extkey1[MLKEM_SYMBYTES] = nonce1; - extkey2[MLKEM_SYMBYTES] = nonce2; - extkey3[MLKEM_SYMBYTES] = nonce3; - prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3); - poly_cbd_eta1(r0, buf0); - poly_cbd_eta1(r1, buf1); - poly_cbd_eta1(r2, buf2); - poly_cbd_eta1(r3, buf3); - - debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1); -} - -#if MLKEM_K == 2 || MLKEM_K == 4 
-/************************************************* - * Name: poly_cbd_eta2 - * - * Description: Given an array of uniformly random bytes, compute - * polynomial with coefficients distributed according to - * a centered binomial distribution with parameter MLKEM_ETA2. - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *buf: pointer to input byte array - **************************************************/ -static INLINE void poly_cbd_eta2(poly *r, - const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4)) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))) -{ -#if MLKEM_ETA2 == 2 - poly_cbd2(r, buf); -#else -#error "Invalid value of MLKEM_ETA2" -#endif -} - -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES], - uint8_t nonce) -{ - ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4]; - ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1]; - - memcpy(extkey, seed, MLKEM_SYMBYTES); - extkey[MLKEM_SYMBYTES] = nonce; - prf_eta2(buf, extkey); - - poly_cbd_eta2(r, buf); - - debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA1 + 1); -} -#endif /* MLKEM_K == 2 || MLKEM_K == 4 */ - - -#if MLKEM_K == 2 -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t seed[MLKEM_SYMBYTES], - uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, - uint8_t nonce3) -{ - ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4]; - ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4]; - ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1]; - memcpy(extkey[0], seed, MLKEM_SYMBYTES); - memcpy(extkey[1], seed, MLKEM_SYMBYTES); - memcpy(extkey[2], seed, MLKEM_SYMBYTES); - memcpy(extkey[3], seed, MLKEM_SYMBYTES); - extkey[0][MLKEM_SYMBYTES] = nonce0; - extkey[1][MLKEM_SYMBYTES] = nonce1; - extkey[2][MLKEM_SYMBYTES] = nonce2; - 
extkey[3][MLKEM_SYMBYTES] = nonce3; - - prf_eta1(buf1[0], extkey[0]); - prf_eta1(buf1[1], extkey[1]); - prf_eta2(buf2[0], extkey[2]); - prf_eta2(buf2[1], extkey[3]); - - poly_cbd_eta1(r0, buf1[0]); - poly_cbd_eta1(r1, buf1[1]); - poly_cbd_eta2(r2, buf2[0]); - poly_cbd_eta2(r3, buf2[1]); - - debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1); - debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1); - debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1); -} -#endif /* MLKEM_K == 2 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/polyvec.h deleted file mode 100644 index 8be8579e0..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/polyvec.h +++ /dev/null @@ -1,595 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef POLYVEC_H -#define POLYVEC_H - -#include -#include "common.h" -#include "poly.h" - -#define polyvec MLKEM_NAMESPACE_K(polyvec) -typedef struct -{ - poly vec[MLKEM_K]; -} ALIGN polyvec; - -#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache) -typedef struct -{ - poly_mulcache vec[MLKEM_K]; -} polyvec_mulcache; - -#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du) -/************************************************* - * Name: poly_compress_du - * - * Description: Compression (du bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], - const poly *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU)) - requires(memory_no_alias(a, sizeof(poly))) - requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU))) -{ -#if MLKEM_DU == 10 - poly_compress_d10(r, a); -#elif MLKEM_DU == 11 - poly_compress_d11(r, a); -#else -#error "Invalid value of MLKEM_DU" -#endif -} - -#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du) -/************************************************* - * Name: poly_decompress_du - * - * Description: De-serialization and subsequent decompression (du bits) of a - * polynomial; approximate inverse of poly_compress_du - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). 
- * - **************************************************/ -static INLINE void poly_decompress_du( - poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(memory_slice(r, sizeof(poly))) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) -{ -#if MLKEM_DU == 10 - poly_decompress_d10(r, a); -#elif MLKEM_DU == 11 - poly_decompress_d11(r, a); -#else -#error "Invalid value of MLKEM_DU" -#endif -} - -#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv) -/************************************************* - * Name: poly_compress_dv - * - * Description: Compression (dv bits) and subsequent serialization of a - * polynomial - * - * Arguments: - uint8_t *r: pointer to output byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes) - * - const poly *a: pointer to input polynomial - * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. 
- **************************************************/ -static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], - const poly *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV)) - requires(memory_no_alias(a, sizeof(poly))) - requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - assigns(object_whole(r))) -{ -#if MLKEM_DV == 4 - poly_compress_d4(r, a); -#elif MLKEM_DV == 5 - poly_compress_d5(r, a); -#else -#error "Invalid value of MLKEM_DV" -#endif -} - - -#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv) -/************************************************* - * Name: poly_decompress_dv - * - * Description: De-serialization and subsequent decompression (dv bits) of a - * polynomial; approximate inverse of poly_compress - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes) - * - * Upon return, the coefficients of the output polynomial are unsigned-canonical - * (non-negative and smaller than MLKEM_Q). - * - **************************************************/ -static INLINE void poly_decompress_dv( - poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV)) - requires(memory_no_alias(r, sizeof(poly))) - assigns(object_whole(r)) - ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) -{ -#if MLKEM_DV == 4 - poly_decompress_d4(r, a); -#elif MLKEM_DV == 5 - poly_decompress_d5(r, a); -#else -#error "Invalid value of MLKEM_DV" -#endif -} - -#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du) -/************************************************* - * Name: polyvec_compress_du - * - * Description: Compress and serialize vector of polynomials - * - * Arguments: - uint8_t *r: pointer to output byte array - * (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU) - * - const polyvec *a: pointer to input vector of polynomials. 
- * Coefficients must be unsigned canonical, - * i.e. in [0,1,..,MLKEM_Q-1]. - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU], - const polyvec *a) -__contract__( - requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU)) - requires(memory_no_alias(a, sizeof(polyvec))) - requires(forall(k0, 0, MLKEM_K, - array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) - assigns(object_whole(r)) -); - -#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du) -/************************************************* - * Name: polyvec_decompress_du - * - * Description: De-serialize and decompress vector of polynomials; - * approximate inverse of polyvec_compress_du - * - * Arguments: - polyvec *r: pointer to output vector of polynomials. - * Output will have coefficients normalized to [0,..,q-1]. - * - const uint8_t *a: pointer to input byte array - * (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU) - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_decompress_du(polyvec *r, - const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU]) -__contract__( - requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU)) - requires(memory_no_alias(r, sizeof(polyvec))) - assigns(object_whole(r)) - ensures(forall(k0, 0, MLKEM_K, - array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) -); - -#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes) -/************************************************* - * Name: polyvec_tobytes - * - * Description: Serialize vector of polynomials - * - * Arguments: - uint8_t *r: pointer to output byte array - * (needs space for MLKEM_POLYVECBYTES) - * - const polyvec *a: pointer to input vector of polynomials - * Each polynomial must have coefficients in [0,..,q-1]. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a) -__contract__( - requires(memory_no_alias(a, sizeof(polyvec))) - requires(memory_no_alias(r, MLKEM_POLYVECBYTES)) - requires(forall(k0, 0, MLKEM_K, - array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) - assigns(object_whole(r)) -); - -#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes) -/************************************************* - * Name: polyvec_frombytes - * - * Description: De-serialize vector of polynomials; - * inverse of polyvec_tobytes - * - * Arguments: - const polyvec *a: pointer to output vector of polynomials - * (of length MLKEM_POLYVECBYTES). Output will have coefficients - * normalized in [0..4095]. - * - uint8_t *r: pointer to input byte array - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES]) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - requires(memory_no_alias(a, MLKEM_POLYVECBYTES)) - assigns(object_whole(r)) - ensures(forall(k0, 0, MLKEM_K, - array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) -); - -#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt) -/************************************************* - * Name: polyvec_ntt - * - * Description: Apply forward NTT to all elements of a vector of polynomials. - * - * The input is assumed to be in normal order and - * coefficient-wise bound by MLKEM_Q in absolute value. - * - * The output polynomial is in bitreversed order, and - * coefficient-wise bound by NTT_BOUND in absolute value. 
- * - * Arguments: - polyvec *r: pointer to in/output vector of polynomials - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_ntt(polyvec *r) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - requires(forall(j, 0, MLKEM_K, - array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q))) - assigns(object_whole(r)) - ensures(forall(j, 0, MLKEM_K, - array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND))) -); - -#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont) -/************************************************* - * Name: polyvec_invntt_tomont - * - * Description: Apply inverse NTT to all elements of a vector of polynomials - * and multiply by Montgomery factor 2^16 - * - * The input is assumed to be in bitreversed order, and can - * have arbitrary coefficients in int16_t. - * - * The output polynomial is in normal order, and - * coefficient-wise bound by INVNTT_BOUND in absolute value. - * - * - * Arguments: - polyvec *r: pointer to in/output vector of polynomials - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_invntt_tomont(polyvec *r) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - assigns(object_whole(r)) - ensures(forall(j, 0, MLKEM_K, - array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, INVNTT_BOUND))) -); - -#define polyvec_basemul_acc_montgomery \ - MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery) -/************************************************* - * Name: polyvec_basemul_acc_montgomery - * - * Description: Multiply elements of a and b in NTT domain, accumulate into r, - * and multiply by 2^-16. 
- * - * Arguments: - poly *r: pointer to output polynomial - * - const polyvec *a: pointer to first input vector of polynomials - * - const polyvec *b: pointer to second input vector of polynomials - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(a, sizeof(polyvec))) - requires(memory_no_alias(b, sizeof(polyvec))) - requires(forall(k1, 0, MLKEM_K, - array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) - assigns(memory_slice(r, sizeof(poly))) -); - - -#define polyvec_basemul_acc_montgomery_cached \ - MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached) -/************************************************* - * Name: polyvec_basemul_acc_montgomery_cached - * - * Description: Scalar product of two vectors of polynomials in NTT domain, - * using mulcache for second operand. - * - * Bounds: - * - Every coefficient of a is assumed to be in [0..4095] - * - No bounds guarantees for the coefficients in the result. - * - * Arguments: - poly *r: pointer to output polynomial - * - const polyvec *a: pointer to first input polynomial vector - * - const polyvec *b: pointer to second input polynomial vector - * - const polyvec_mulcache *b_cache: pointer to mulcache - * for second input polynomial vector. Can be computed - * via polyvec_mulcache_compute(). 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, - const polyvec *b, - const polyvec_mulcache *b_cache) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(a, sizeof(polyvec))) - requires(memory_no_alias(b, sizeof(polyvec))) - requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache))) - requires(forall(k1, 0, MLKEM_K, - array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))) - assigns(memory_slice(r, sizeof(poly))) -); - -#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute) -/************************************************************ - * Name: polyvec_mulcache_compute - * - * Description: Computes the mulcache for a vector of polynomials in NTT domain - * - * The mulcache of a degree-2 polynomial b := b0 + b1*X - * in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when - * computing products of b in Fq[X]/(X^2-zeta). - * - * The mulcache of a polynomial in NTT domain -- which is - * a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta), - * for varying zeta, is the 128-tuple of mulcaches of those - * polynomials. - * - * The mulcache of a vector of polynomials is the vector - * of mulcaches of its entries. - * - * Arguments: - x: Pointer to mulcache to be populated - * - a: Pointer to input polynomial vector - ************************************************************/ -/* - * NOTE: The default C implementation of this function populates - * the mulcache with values in (-q,q), but this is not needed for the - * higher level safety proofs, and thus not part of the spec. 
- */ -MLKEM_NATIVE_INTERNAL_API -void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a) -__contract__( - requires(memory_no_alias(x, sizeof(polyvec_mulcache))) - requires(memory_no_alias(a, sizeof(polyvec))) - assigns(object_whole(x)) -); - -#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce) -/************************************************* - * Name: polyvec_reduce - * - * Description: Applies Barrett reduction to each coefficient - * of each element of a vector of polynomials; - * for details of the Barrett reduction see comments in reduce.c - * - * Arguments: - polyvec *r: pointer to input/output polynomial - **************************************************/ -/* - * NOTE: The semantics of polyvec_reduce() is different in - * the reference implementation, which requires - * signed canonical output data. Unsigned canonical - * outputs are better suited to the only remaining - * use of poly_reduce() in the context of (de)serialization. - */ -MLKEM_NATIVE_INTERNAL_API -void polyvec_reduce(polyvec *r) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - assigns(object_whole(r)) - ensures(forall(k0, 0, MLKEM_K, - array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))) -); - -#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add) -/************************************************* - * Name: polyvec_add - * - * Description: Add vectors of polynomials - * - * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be - * added to - * - const polyvec *b: pointer to second input vector of polynomials - * - * The coefficients of r and b must be so that the addition does - * not overflow. Otherwise, the behaviour of this function is undefined. - * - * The coefficients returned in *r are in int16_t which is sufficient - * to prove type-safety of calling units. Therefore, no stronger - * ensures clause is required on this function. 
- **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_add(polyvec *r, const polyvec *b) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - requires(memory_no_alias(b, sizeof(polyvec))) - requires(forall(j0, 0, MLKEM_K, - forall(k0, 0, MLKEM_N, - (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX))) - requires(forall(j1, 0, MLKEM_K, - forall(k1, 0, MLKEM_N, - (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN))) - assigns(object_whole(r)) -); - -#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont) -/************************************************* - * Name: polyvec_tomont - * - * Description: Inplace conversion of all coefficients of a polynomial - * vector from normal domain to Montgomery domain - * - * Bounds: Output < q in absolute value. - * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void polyvec_tomont(polyvec *r) -__contract__( - requires(memory_no_alias(r, sizeof(polyvec))) - assigns(memory_slice(r, sizeof(polyvec))) - assigns(object_whole(r)) - ensures(forall(j, 0, MLKEM_K, - array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q))) -); - -#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x) -/************************************************* - * Name: poly_getnoise_eta1_4x - * - * Description: Batch sample four polynomials deterministically from a seed - * and nonces, with output polynomials close to centered binomial distribution - * with parameter MLKEM_ETA1. 
- * - * Arguments: - poly *r{0,1,2,3}: pointer to output polynomial - * - const uint8_t *seed: pointer to input seed - * (of length MLKEM_SYMBYTES bytes) - * - uint8_t nonce{0,1,2,3}: one-byte input nonce - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0, - uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) -/* Depending on MLKEM_K, the pointers passed to this function belong - to the same objects, so we cannot use memory_no_alias for r0-r3. - - NOTE: Somehow it is important to use memory_no_alias() first in the - conjunctions defining each case. -*/ -#if MLKEM_K == 2 -__contract__( - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */ - (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) && - r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2))) - assigns(memory_slice(r0, sizeof(poly))) - assigns(memory_slice(r1, sizeof(poly))) - assigns(memory_slice(r2, sizeof(poly))) - assigns(memory_slice(r3, sizeof(poly))) - ensures( - array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); -); -#elif MLKEM_K == 4 -__contract__( - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - requires( /* Case B: r0, r1, r2, r3 consecutive */ - (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3)) - assigns(memory_slice(r0, sizeof(poly))) - assigns(memory_slice(r1, sizeof(poly))) - assigns(memory_slice(r2, sizeof(poly))) - assigns(memory_slice(r3, sizeof(poly))) - ensures( - array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 
1) - && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); -); -#elif MLKEM_K == 3 -__contract__( - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - requires( /* Case C: r0, r1, r2 consecutive */ - (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) && - r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0))) - assigns(memory_slice(r0, sizeof(poly))) - assigns(memory_slice(r1, sizeof(poly))) - assigns(memory_slice(r2, sizeof(poly))) - assigns(memory_slice(r3, sizeof(poly))) - ensures( - array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)); -); -#endif /* MLKEM_K */ - -#if MLKEM_ETA1 == MLKEM_ETA2 -/* - * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024 - * where MLKEM_ETA2 = MLKEM_ETA1 = 2. - * For ml-kem-512, poly_getnoise_eta1122_4x is used instead. - */ -#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x -#endif /* MLKEM_ETA1 == MLKEM_ETA2 */ - -#if MLKEM_K == 2 || MLKEM_K == 4 -#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2) -/************************************************* - * Name: poly_getnoise_eta2 - * - * Description: Sample a polynomial deterministically from a seed and a nonce, - * with output polynomial close to centered binomial distribution - * with parameter MLKEM_ETA2 - * - * Arguments: - poly *r: pointer to output polynomial - * - const uint8_t *seed: pointer to input seed - * (of length MLKEM_SYMBYTES bytes) - * - uint8_t nonce: one-byte input nonce - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES], - uint8_t nonce) -__contract__( - requires(memory_no_alias(r, sizeof(poly))) - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - assigns(object_whole(r)) - ensures(array_abs_bound(r->coeffs, 0, 
MLKEM_N, MLKEM_ETA2 + 1)) -); -#endif /* MLKEM_K == 2 || MLKEM_K == 4 */ - -#if MLKEM_K == 2 -#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x) -/************************************************* - * Name: poly_getnoise_eta1122_4x - * - * Description: Batch sample four polynomials deterministically from a seed - * and a nonces, with output polynomials close to centered binomial - * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2 - * - * Arguments: - poly *r{0,1,2,3}: pointer to output polynomial - * - const uint8_t *seed: pointer to input seed - * (of length MLKEM_SYMBYTES bytes) - * - uint8_t nonce{0,1,2,3}: one-byte input nonce - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t seed[MLKEM_SYMBYTES], - uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, - uint8_t nonce3) -__contract__( - requires( /* r0, r1 consecutive, r2, r3 consecutive */ - (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) && - r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2))) - requires(memory_no_alias(seed, MLKEM_SYMBYTES)) - assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3)) - ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1) - && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1) - && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)); -); -#endif /* MLKEM_K == 2 */ - -#endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/reduce.h deleted file mode 100644 index b432a4201..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/reduce.h +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef REDUCE_H -#define REDUCE_H - -#include -#include 
"cbmc.h" -#include "common.h" -#include "debug.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16) -#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic) -#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce) -#define fqmul MLKEM_NAMESPACE(fqmul) -#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce) -/* End of static namespacing */ - -#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */ - -/************************************************* - * Name: cast_uint16_to_int16 - * - * Description: Cast uint16 value to int16 - * - * Returns: - * input x in 0 .. 32767: returns value unchanged - * input x in 32768 .. 65535: returns (x - 65536) - **************************************************/ -#ifdef CBMC -#pragma CPROVER check push -#pragma CPROVER check disable "conversion" -#endif -ALWAYS_INLINE -static INLINE int16_t cast_uint16_to_int16(uint16_t x) -{ - /* - * PORTABILITY: This relies on uint16_t -> int16_t - * being implemented as the inverse of int16_t -> uint16_t, - * which is implementation-defined (C99 6.3.1.3 (3)) - * CBMC (correctly) fails to prove this conversion is OK, - * so we have to suppress that check here - */ - return (int16_t)x; -} -#ifdef CBMC -#pragma CPROVER check pop -#endif - -/************************************************* - * Name: montgomery_reduce_generic - * - * Description: Generic Montgomery reduction; given a 32-bit integer a, computes - * 16-bit integer congruent to a * R^-1 mod q, where R=2^16 - * - * Arguments: - int32_t a: input integer to be reduced - * - * Returns: integer congruent to a * R^-1 modulo q, with absolute value - * <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2 - * - **************************************************/ -ALWAYS_INLINE -static INLINE int16_t montgomery_reduce_generic(int32_t a) -{ - /* QINV == 
-3327 converted to uint16_t == -3327 + 65536 == 62209 */ - const uint32_t QINV = 62209; /* q^-1 mod 2^16 */ - - /* Compute a*q^{-1} mod 2^16 in unsigned representatives */ - const uint16_t a_reduced = a & UINT16_MAX; - const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX; - - /* Lift to signed canonical representative mod 2^16. */ - const int16_t t = cast_uint16_to_int16(a_inverted); - - int32_t r = a - ((int32_t)t * MLKEM_Q); - /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */ - - /* - * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, - * implementation-defined for negative left argument. Here, - * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) - */ - r = r >> 16; - /* Bounds: |r >> 16| <= ceil(|r| / 2^16) - * <= ceil(|a| / 2^16 + MLKEM_Q / 2) - * <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 - * - * (Note that |a >> n| = ceil(|a| / 2^16) for negative a) - */ - - return (int16_t)r; -} - -/************************************************* - * Name: montgomery_reduce - * - * Description: Montgomery reduction - * - * Arguments: - int32_t a: input integer to be reduced - * Must be smaller than 2 * 2^12 * 2^15 in absolute value. - * - * Returns: integer congruent to a * R^-1 modulo q, - * smaller than 2 * q in absolute value. 
- **************************************************/ -static INLINE int16_t montgomery_reduce(int32_t a) -__contract__( - requires(a > -(2 * UINT12_LIMIT * 32768)) - requires(a < (2 * UINT12_LIMIT * 32768)) - ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q) -) -{ - int16_t res; - debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768); - - res = montgomery_reduce_generic(a); - /* Bounds: - * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2 - * <= UINT12_LIMIT + (MLKEM_Q + 1) / 2 - * < 2 * MLKEM_Q */ - - debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q); - return res; -} - -/************************************************* - * Name: fqmul - * - * Description: Montgomery multiplication modulo q=3329 - * - * Arguments: - int16_t a: first factor - * Can be any int16_t. - * - int16_t b: second factor. - * Must be signed canonical (abs value <(q+1)/2) - * - * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and - * smaller than q in absolute value. 
- * - **************************************************/ -static INLINE int16_t fqmul(int16_t a, int16_t b) -__contract__( - requires(b > -HALF_Q) - requires(b < HALF_Q) - ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q) -) -{ - int16_t res; - debug_assert_abs_bound(&b, 1, HALF_Q); - - res = montgomery_reduce((int32_t)a * (int32_t)b); - /* Bounds: - * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2 - * <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2 - * < MLKEM_Q - */ - - debug_assert_abs_bound(&res, 1, MLKEM_Q); - return res; -} - -/************************************************* - * Name: barrett_reduce - * - * Description: Barrett reduction; given a 16-bit integer a, computes - * centered representative congruent to a mod q in - * {-(q-1)/2,...,(q-1)/2} - * - * Arguments: - int16_t a: input integer to be reduced - * - * Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. - **************************************************/ -static INLINE int16_t barrett_reduce(int16_t a) -__contract__( - ensures(return_value > -HALF_Q && return_value < HALF_Q) -) -{ - /* - * To divide by MLKEM_Q using Barrett multiplication, the "magic number" - * multiplier is round_to_nearest(2**26/MLKEM_Q) - */ - const int BPOWER = 26; - const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q; - - /* - * Compute round_to_nearest(a/MLKEM_Q) using the multiplier - * above and shift by BPOWER places. - * PORTABILITY: Right-shift on a signed integer is, strictly-speaking, - * implementation-defined for negative left argument. Here, - * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5)) - */ - const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER; - - /* - * t is in -10 .. 
+10, so we need 32-bit math to - * evaluate t * MLKEM_Q and the subsequent subtraction - */ - int16_t res = (int16_t)(a - t * MLKEM_Q); - - debug_assert_abs_bound(&res, 1, HALF_Q); - return res; -} - -#endif diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/rej_uniform.c deleted file mode 100644 index cbbe4407f..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/rej_uniform.c +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#include "common.h" -#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) - -#include "arith_backend.h" -#include "debug.h" -#include "fips202.h" -#include "fips202x4.h" -#include "rej_uniform.h" -#include "symmetric.h" - -/* Static namespacing - * This is to facilitate building multiple instances - * of mlkem-native (e.g. with varying security levels) - * within a single compilation unit. */ -#define rej_uniform MLKEM_NAMESPACE(rej_uniform) -#define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar) -/* End of static namespacing */ - -static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target, - unsigned int offset, const uint8_t *buf, - unsigned int buflen) -__contract__( - requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0) - requires(memory_no_alias(r, sizeof(int16_t) * target)) - requires(memory_no_alias(buf, buflen)) - requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * target)) - ensures(offset <= return_value && return_value <= target) - ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q)) -) -{ - unsigned int ctr, pos; - uint16_t val0, val1; - - debug_assert_bound(r, offset, 0, MLKEM_Q); - - ctr = offset; - pos = 0; - /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */ - while (ctr < target && pos + 3 <= buflen) - __loop__( - 
invariant(offset <= ctr && ctr <= target && pos <= buflen) - invariant(ctr > 0 ==> array_bound(r, 0, ctr, 0, MLKEM_Q))) - { - val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; - val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; - pos += 3; - - if (val0 < MLKEM_Q) - { - r[ctr++] = val0; - } - if (ctr < target && val1 < MLKEM_Q) - { - r[ctr++] = val1; - } - } - - debug_assert_bound(r, ctr, 0, MLKEM_Q); - return ctr; -} - -#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM) -/************************************************* - * Name: rej_uniform - * - * Description: Run rejection sampling on uniform random bytes to generate - * uniform random integers mod q - * - * Arguments: - int16_t *r: pointer to output buffer - * - unsigned int target: requested number of 16-bit integers - * (uniform mod q). - * Must be <= 4096. - * - unsigned int offset: number of 16-bit integers that have - * already been sampled. - * Must be <= target. - * - const uint8_t *buf: pointer to input buffer - * (assumed to be uniform random bytes) - * - unsigned int buflen: length of input buffer in bytes - * Must be <= 4096. - * Must be a multiple of 3. - * - * Note: Strictly speaking, only a few values of buflen near UINT_MAX need - * excluding. The limit of 4096 is somewhat arbitary but sufficient for all - * uses of this function. Similarly, the actual limit for target is UINT_MAX/2. - * - * Returns the new offset of sampled 16-bit integers, at most target, - * and at least the initial offset. - * If the new offset is strictly less than len, all of the input buffers - * is guaranteed to have been consumed. If it is equal to len, no information - * is provided on how many bytes of the input buffer have been consumed. - **************************************************/ - -/* - * NOTE: The signature differs from the Kyber reference implementation - * in that it adds the offset and always expects the base of the target - * buffer. 
This avoids shifting the buffer base in the caller, which appears - * tricky to reason about. - */ -static unsigned int rej_uniform(int16_t *r, unsigned int target, - unsigned int offset, const uint8_t *buf, - unsigned int buflen) -__contract__( - requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0) - requires(memory_no_alias(r, sizeof(int16_t) * target)) - requires(memory_no_alias(buf, buflen)) - requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q)) - assigns(memory_slice(r, sizeof(int16_t) * target)) - ensures(offset <= return_value && return_value <= target) - ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q)) -) -{ - return rej_uniform_scalar(r, target, offset, buf, buflen); -} -#else /* MLKEM_USE_NATIVE_REJ_UNIFORM */ -static unsigned int rej_uniform(int16_t *r, unsigned int target, - unsigned int offset, const uint8_t *buf, - unsigned int buflen) -{ - int ret; - - /* Sample from large buffer with full lane as much as possible. 
*/ - ret = rej_uniform_native(r + offset, target - offset, buf, buflen); - if (ret != -1) - { - unsigned res = offset + (unsigned)ret; - debug_assert_bound(r, res, 0, MLKEM_Q); - return res; - } - - return rej_uniform_scalar(r, target, offset, buf, buflen); -} -#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */ - -#ifndef MLKEM_GEN_MATRIX_NBLOCKS -#define MLKEM_GEN_MATRIX_NBLOCKS \ - ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE) -#endif - -MLKEM_NATIVE_INTERNAL_API -void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4]) -{ - /* Temporary buffers for XOF output before rejection sampling */ - uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - - /* Tracks the number of coefficients we have already sampled */ - unsigned int ctr[KECCAK_WAY]; - xof_x4_ctx statex; - unsigned int buflen; - - shake128x4_inc_init(&statex); - - /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */ - xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3], - MLKEM_SYMBYTES + 2); - - /* - * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS. - * This should generate the matrix entries with high probability. - */ - xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS, - &statex); - buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE; - ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen); - ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen); - ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen); - ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen); - - /* - * So long as not all matrix entries have been generated, squeeze - * one more block a time until we're done. 
- */ - buflen = XOF_RATE; - while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N || - ctr[3] < MLKEM_N) - __loop__( - assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0), - object_whole(buf1), object_whole(buf2), object_whole(buf3)) - invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N) - invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N) - invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q)) - invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q)) - invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q)) - invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q))) - { - xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex); - ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen); - ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen); - ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen); - ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen); - } - - xof_x4_release(&statex); -} - -MLKEM_NATIVE_INTERNAL_API -void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) -{ - xof_ctx state; - uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; - unsigned int ctr, buflen; - - shake128_inc_init(&state); - - xof_absorb(&state, seed, MLKEM_SYMBYTES + 2); - - /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS. - */ - /* This should generate the matrix entry with high probability. 
*/ - xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state); - buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE; - ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen); - - /* Squeeze + sample one more block a time until we're done */ - buflen = XOF_RATE; - while (ctr < MLKEM_N) - __loop__( - assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf)) - invariant(ctr <= MLKEM_N) - invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q))) - { - xof_squeezeblocks(buf, 1, &state); - ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen); - } - - xof_release(&state); -} - -#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ - -#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform) -int empty_cu_rej_uniform; - -#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/rej_uniform.h deleted file mode 100644 index 801287259..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/rej_uniform.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef REJ_UNIFORM_H -#define REJ_UNIFORM_H - -#include -#include -#include "cbmc.h" -#include "common.h" -#include "poly.h" - -#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4) -/************************************************* - * Name: poly_rej_uniform_x4 - * - * Description: Generate four polynomials using rejection sampling - * on (pseudo-)uniformly random bytes sampled from a seed. - * - * Arguments: - poly *vec: Pointer to an array of 4 polynomials - * to be sampled. - * - uint8_t *seed[4]: Pointer to array of four pointers - * pointing to the seed buffers of size - * MLKEM_SYMBYTES + 2 each. 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4]) -__contract__( - requires(memory_no_alias(vec, sizeof(poly) * 4)) - requires(memory_no_alias(seed, sizeof(uint8_t*) * 4)) - requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2)) - requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2)) - requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2)) - requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2)) - assigns(memory_slice(vec, sizeof(poly) * 4)) - ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) - ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q))); - -#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform) -/************************************************* - * Name: poly_rej_uniform - * - * Description: Generate polynomial using rejection sampling - * on (pseudo-)uniformly random bytes sampled from a seed. - * - * Arguments: - poly *vec: Pointer to polynomial to be sampled. - * - uint8_t *seed: Pointer to seed buffer of size - * MLKEM_SYMBYTES + 2 each. 
- * - **************************************************/ -MLKEM_NATIVE_INTERNAL_API -void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) -__contract__( - requires(memory_no_alias(entry, sizeof(poly))) - requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2)) - assigns(memory_slice(entry, sizeof(poly))) - ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q))); - -#endif /* REJ_UNIFORM_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/sampling.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/sampling.c new file mode 100644 index 000000000..98cbdcb74 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/sampling.c @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#include "common.h" +#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) + +#include "arith_backend.h" +#include "debug.h" +#include "fips202.h" +#include "fips202x4.h" +#include "sampling.h" +#include "symmetric.h" + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define rej_uniform MLKEM_NAMESPACE(rej_uniform) +#define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar) +#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) +#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) +/* End of static namespacing */ + +static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target, + unsigned int offset, const uint8_t *buf, + unsigned int buflen) +__contract__( + requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0) + requires(memory_no_alias(r, sizeof(int16_t) * target)) + requires(memory_no_alias(buf, buflen)) + requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * target)) + ensures(offset <= return_value && return_value <= target) + ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q)) +) +{ + unsigned int ctr, pos; + uint16_t val0, val1; + + debug_assert_bound(r, offset, 0, MLKEM_Q); + + ctr = offset; + pos = 0; + /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */ + while (ctr < target && pos + 3 <= buflen) + __loop__( + invariant(offset <= ctr && ctr <= target && pos <= buflen) + invariant(ctr > 0 ==> array_bound(r, 0, ctr, 0, MLKEM_Q))) + { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; + pos += 3; + + if (val0 < MLKEM_Q) + { + r[ctr++] = val0; + } + if (ctr < target && val1 < MLKEM_Q) + { + r[ctr++] = val1; + } + } + + debug_assert_bound(r, ctr, 0, MLKEM_Q); + return ctr; +} + +#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM) +/************************************************* + * Name: rej_uniform + * + * Description: Run rejection sampling on uniform random bytes to generate + * uniform random integers mod q + * + * Arguments: - int16_t *r: pointer to output buffer + * - unsigned int target: requested number of 16-bit integers + * (uniform mod q). + * Must be <= 4096. 
+ * - unsigned int offset: number of 16-bit integers that have + * already been sampled. + * Must be <= target. + * - const uint8_t *buf: pointer to input buffer + * (assumed to be uniform random bytes) + * - unsigned int buflen: length of input buffer in bytes + * Must be <= 4096. + * Must be a multiple of 3. + * + * Note: Strictly speaking, only a few values of buflen near UINT_MAX need + * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all + * uses of this function. Similarly, the actual limit for target is UINT_MAX/2. + * + * Returns the new offset of sampled 16-bit integers, at most target, + * and at least the initial offset. + * If the new offset is strictly less than target, all of the input buffer + * is guaranteed to have been consumed. If it is equal to target, no information + * is provided on how many bytes of the input buffer have been consumed. + **************************************************/ + +/* + * NOTE: The signature differs from the Kyber reference implementation + * in that it adds the offset and always expects the base of the target + * buffer. This avoids shifting the buffer base in the caller, which appears + * tricky to reason about. 
+ */ +static unsigned int rej_uniform(int16_t *r, unsigned int target, + unsigned int offset, const uint8_t *buf, + unsigned int buflen) +__contract__( + requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0) + requires(memory_no_alias(r, sizeof(int16_t) * target)) + requires(memory_no_alias(buf, buflen)) + requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q)) + assigns(memory_slice(r, sizeof(int16_t) * target)) + ensures(offset <= return_value && return_value <= target) + ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q)) +) +{ + return rej_uniform_scalar(r, target, offset, buf, buflen); +} +#else /* MLKEM_USE_NATIVE_REJ_UNIFORM */ +static unsigned int rej_uniform(int16_t *r, unsigned int target, + unsigned int offset, const uint8_t *buf, + unsigned int buflen) +{ + int ret; + + /* Sample from large buffer with full lane as much as possible. */ + ret = rej_uniform_native(r + offset, target - offset, buf, buflen); + if (ret != -1) + { + unsigned res = offset + (unsigned)ret; + debug_assert_bound(r, res, 0, MLKEM_Q); + return res; + } + + return rej_uniform_scalar(r, target, offset, buf, buflen); +} +#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */ + +#ifndef MLKEM_GEN_MATRIX_NBLOCKS +#define MLKEM_GEN_MATRIX_NBLOCKS \ + ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE) +#endif + +MLKEM_NATIVE_INTERNAL_API +void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4]) +{ + /* Temporary buffers for XOF output before rejection sampling */ + uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + + /* Tracks the number of coefficients we have already sampled */ + unsigned int ctr[KECCAK_WAY]; + xof_x4_ctx statex; + unsigned int buflen; + + shake128x4_inc_init(&statex); + + /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */ + 
xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3], + MLKEM_SYMBYTES + 2); + + /* + * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS. + * This should generate the matrix entries with high probability. + */ + xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS, + &statex); + buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE; + ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen); + ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen); + ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen); + ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen); + + /* + * So long as not all matrix entries have been generated, squeeze + * one more block a time until we're done. + */ + buflen = XOF_RATE; + while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N || + ctr[3] < MLKEM_N) + __loop__( + assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0), + object_whole(buf1), object_whole(buf2), object_whole(buf3)) + invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N) + invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N) + invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q)) + invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q)) + invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q)) + invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q))) + { + xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex); + ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen); + ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen); + ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen); + ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen); + } + + xof_x4_release(&statex); +} + +MLKEM_NATIVE_INTERNAL_API +void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) +{ + xof_ctx state; + uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE]; + unsigned int ctr, 
buflen; + + shake128_inc_init(&state); + + xof_absorb(&state, seed, MLKEM_SYMBYTES + 2); + + /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS. + */ + /* This should generate the matrix entry with high probability. */ + xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state); + buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE; + ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen); + + /* Squeeze + sample one more block a time until we're done */ + buflen = XOF_RATE; + while (ctr < MLKEM_N) + __loop__( + assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf)) + invariant(ctr <= MLKEM_N) + invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q))) + { + xof_squeezeblocks(buf, 1, &state); + ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen); + } + + xof_release(&state); +} + +/* Static namespacing + * This is to facilitate building multiple instances + * of mlkem-native (e.g. with varying security levels) + * within a single compilation unit. 
*/ +#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian) +#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian) +/* End of static namespacing */ + +/************************************************* + * Name: load32_littleendian + * + * Description: load 4 bytes into a 32-bit integer + * in little-endian order + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns 32-bit unsigned integer loaded from x + **************************************************/ +static uint32_t load32_littleendian(const uint8_t x[4]) +{ + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + r |= (uint32_t)x[3] << 24; + return r; +} + +MLKEM_NATIVE_INTERNAL_API +void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 8; i++) + __loop__( + invariant(i <= MLKEM_N / 8) + invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3))) + { + unsigned j; + uint32_t t = load32_littleendian(buf + 4 * i); + uint32_t d = t & 0x55555555; + d += (t >> 1) & 0x55555555; + + for (j = 0; j < 8; j++) + __loop__( + invariant(i <= MLKEM_N / 8 && j <= 8) + invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3))) + { + const int16_t a = (d >> (4 * j + 0)) & 0x3; + const int16_t b = (d >> (4 * j + 2)) & 0x3; + r->coeffs[8 * i + j] = a - b; + } + } +} + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 +/************************************************* + * Name: load24_littleendian + * + * Description: load 3 bytes into a 32-bit integer + * in little-endian order. 
+ * This function is only needed for ML-KEM-512 + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) + **************************************************/ +static uint32_t load24_littleendian(const uint8_t x[3]) +{ + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + return r; +} + +MLKEM_NATIVE_INTERNAL_API +void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]) +{ + unsigned i; + for (i = 0; i < MLKEM_N / 4; i++) + __loop__( + invariant(i <= MLKEM_N / 4) + invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4))) + { + unsigned j; + const uint32_t t = load24_littleendian(buf + 3 * i); + uint32_t d = t & 0x00249249; + d += (t >> 1) & 0x00249249; + d += (t >> 2) & 0x00249249; + + for (j = 0; j < 4; j++) + __loop__( + invariant(i <= MLKEM_N / 4 && j <= 4) + invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4))) + { + const int16_t a = (d >> (6 * j + 0)) & 0x7; + const int16_t b = (d >> (6 * j + 3)) & 0x7; + r->coeffs[4 * i + j] = a - b; + } + } +} +#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \ + 3 */ + +#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ + +#define empty_cu_sampling MLKEM_NAMESPACE_K(empty_cu_sampling) +int empty_cu_sampling; + +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/sampling.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/sampling.h new file mode 100644 index 000000000..cc524e0fc --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/sampling.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef SAMPLING_H +#define SAMPLING_H + +#include +#include +#include "cbmc.h" +#include "common.h" +#include "poly.h" + +#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2) +/************************************************* + * Name: 
poly_cbd2 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter eta=2 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]); + +#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 +#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3) +/************************************************* + * Name: poly_cbd3 + * + * Description: Given an array of uniformly random bytes, compute + * polynomial with coefficients distributed according to + * a centered binomial distribution with parameter eta=3. + * This function is only needed for ML-KEM-512 + * + * Arguments: - poly *r: pointer to output polynomial + * - const uint8_t *buf: pointer to input byte array + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]); +#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */ + +#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4) +/************************************************* + * Name: poly_rej_uniform_x4 + * + * Description: Generate four polynomials using rejection sampling + * on (pseudo-)uniformly random bytes sampled from a seed. + * + * Arguments: - poly *vec: Pointer to an array of 4 polynomials + * to be sampled. + * - uint8_t *seed[4]: Pointer to array of four pointers + * pointing to the seed buffers of size + * MLKEM_SYMBYTES + 2 each. 
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4]) +__contract__( + requires(memory_no_alias(vec, sizeof(poly) * 4)) + requires(memory_no_alias(seed, sizeof(uint8_t*) * 4)) + requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2)) + requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2)) + requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2)) + requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2)) + assigns(memory_slice(vec, sizeof(poly) * 4)) + ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q)) + ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q))); + +#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform) +/************************************************* + * Name: poly_rej_uniform + * + * Description: Generate polynomial using rejection sampling + * on (pseudo-)uniformly random bytes sampled from a seed. + * + * Arguments: - poly *vec: Pointer to polynomial to be sampled. + * - uint8_t *seed: Pointer to seed buffer of size + * MLKEM_SYMBYTES + 2 each. 
+ * + **************************************************/ +MLKEM_NATIVE_INTERNAL_API +void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2]) +__contract__( + requires(memory_no_alias(entry, sizeof(poly))) + requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2)) + assigns(memory_slice(entry, sizeof(poly))) + ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q))); + +#endif /* SAMPLING_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/zetas.c index 4ef887c62..987f0dce4 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/zetas.c @@ -10,7 +10,7 @@ #include "common.h" #if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED) -#include "ntt.h" +#include "poly.h" /* * Table of zeta values used in the reference NTT and inverse NTT. diff --git a/tests/constant_time/kem/passes/ml_kem b/tests/constant_time/kem/passes/ml_kem index cc4f93e4e..34674562b 100644 --- a/tests/constant_time/kem/passes/ml_kem +++ b/tests/constant_time/kem/passes/ml_kem @@ -12,14 +12,14 @@ fun:PQCP_MLKEM_NATIVE_MLKEM*_dec } { - + Rejection sampling to produce public "A" matrix Memcheck:Cond ... fun:PQCP_MLKEM_NATIVE_MLKEM*_gen_matrix fun:PQCP_MLKEM_NATIVE_MLKEM*_indcpa_* } { - + Rejection sampling to produce public "A" matrix Memcheck:Value8 ... fun:PQCP_MLKEM_NATIVE_MLKEM*_gen_matrix