diff --git a/docs/algorithms/kem/ml_kem.md b/docs/algorithms/kem/ml_kem.md
index 1b9244c41..73b4e0f80 100644
--- a/docs/algorithms/kem/ml_kem.md
+++ b/docs/algorithms/kem/ml_kem.md
@@ -7,7 +7,7 @@
 - **Authors' website**: https://pq-crystals.org/kyber/ and https://csrc.nist.gov/pubs/fips/203
 - **Specification version**: ML-KEM.
 - **Primary Source**<a name="primary-source"></a>:
-  - **Source**: https://github.com/pq-code-package/mlkem-native/commit/21c0c397f243543a9d4334860d9edb1d4e6a6cda
+  - **Source**: https://github.com/pq-code-package/mlkem-native/commit/68a82c658399c470624087b52c6d99032114c0b5
   - **Implementation license (SPDX-Identifier)**: CC0-1.0 or Apache-2.0
 
 
@@ -24,6 +24,8 @@
 |       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?‡   |
 |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:----------------------|
 | [Primary Source](#primary-source) | ref                      | All                         | All                             | None                    | True                               | True                                           | False                 |
+| [Primary Source](#primary-source) | x86\_64                  | x86\_64                     | Linux,Darwin                    | AVX2,BMI2,POPCNT        | True                               | True                                           | False                 |
+| [Primary Source](#primary-source) | aarch64                  | ARM64\_V8                   | Linux,Darwin                    | None                    | True                               | False                                          | False                 |
 
 Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
@@ -34,6 +36,8 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**.
 |       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?   |
 |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
 | [Primary Source](#primary-source) | ref                      | All                         | All                             | None                    | True                               | True                                           | False                |
+| [Primary Source](#primary-source) | x86\_64                  | x86\_64                     | Linux,Darwin                    | AVX2,BMI2,POPCNT        | True                               | True                                           | False                |
+| [Primary Source](#primary-source) | aarch64                  | ARM64\_V8                   | Linux,Darwin                    | None                    | True                               | False                                          | False                |
 
 Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
@@ -42,6 +46,8 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**.
 |       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?   |
 |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
 | [Primary Source](#primary-source) | ref                      | All                         | All                             | None                    | True                               | True                                           | False                |
+| [Primary Source](#primary-source) | x86\_64                  | x86\_64                     | Linux,Darwin                    | AVX2,BMI2,POPCNT        | True                               | True                                           | False                |
+| [Primary Source](#primary-source) | aarch64                  | ARM64\_V8                   | Linux,Darwin                    | None                    | True                               | False                                          | False                |
 
 Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
diff --git a/docs/algorithms/kem/ml_kem.yml b/docs/algorithms/kem/ml_kem.yml
index b042606ab..82ebeb84a 100644
--- a/docs/algorithms/kem/ml_kem.yml
+++ b/docs/algorithms/kem/ml_kem.yml
@@ -17,7 +17,7 @@ website: https://pq-crystals.org/kyber/ and https://csrc.nist.gov/pubs/fips/203
 nist-round: FIPS203
 spec-version: ML-KEM
 primary-upstream:
-  source: https://github.com/pq-code-package/mlkem-native/commit/21c0c397f243543a9d4334860d9edb1d4e6a6cda
+  source: https://github.com/pq-code-package/mlkem-native/commit/68a82c658399c470624087b52c6d99032114c0b5
   spdx-license-identifier: CC0-1.0 or Apache-2.0
 parameter-sets:
 - name: ML-KEM-512
@@ -37,6 +37,34 @@ parameter-sets:
     no-secret-dependent-branching-claimed: true
     no-secret-dependent-branching-checked-by-valgrind: true
     large-stack-usage: false
+  - upstream: primary-upstream
+    upstream-id: x86_64
+    supported-platforms:
+    - architecture: x86_64
+      operating_systems:
+      - Linux
+      - Darwin
+      required_flags:
+      - avx2
+      - bmi2
+      - popcnt
+    common-crypto:
+    - SHA3: liboqs
+    no-secret-dependent-branching-claimed: true
+    no-secret-dependent-branching-checked-by-valgrind: true
+    large-stack-usage: false
+  - upstream: primary-upstream
+    upstream-id: aarch64
+    supported-platforms:
+    - architecture: ARM64_V8
+      operating_systems:
+      - Linux
+      - Darwin
+    common-crypto:
+    - SHA3: liboqs
+    no-secret-dependent-branching-claimed: true
+    no-secret-dependent-branching-checked-by-valgrind: false
+    large-stack-usage: false
 - name: ML-KEM-768
   claimed-nist-level: 3
   claimed-security: IND-CCA2
@@ -54,6 +82,34 @@ parameter-sets:
     no-secret-dependent-branching-claimed: true
     no-secret-dependent-branching-checked-by-valgrind: true
     large-stack-usage: false
+  - upstream: primary-upstream
+    upstream-id: x86_64
+    supported-platforms:
+    - architecture: x86_64
+      operating_systems:
+      - Linux
+      - Darwin
+      required_flags:
+      - avx2
+      - bmi2
+      - popcnt
+    common-crypto:
+    - SHA3: liboqs
+    no-secret-dependent-branching-claimed: true
+    no-secret-dependent-branching-checked-by-valgrind: true
+    large-stack-usage: false
+  - upstream: primary-upstream
+    upstream-id: aarch64
+    supported-platforms:
+    - architecture: ARM64_V8
+      operating_systems:
+      - Linux
+      - Darwin
+    common-crypto:
+    - SHA3: liboqs
+    no-secret-dependent-branching-claimed: true
+    no-secret-dependent-branching-checked-by-valgrind: false
+    large-stack-usage: false
 - name: ML-KEM-1024
   claimed-nist-level: 5
   claimed-security: IND-CCA2
@@ -71,3 +127,31 @@ parameter-sets:
     no-secret-dependent-branching-claimed: true
     no-secret-dependent-branching-checked-by-valgrind: true
     large-stack-usage: false
+  - upstream: primary-upstream
+    upstream-id: x86_64
+    supported-platforms:
+    - architecture: x86_64
+      operating_systems:
+      - Linux
+      - Darwin
+      required_flags:
+      - avx2
+      - bmi2
+      - popcnt
+    common-crypto:
+    - SHA3: liboqs
+    no-secret-dependent-branching-claimed: true
+    no-secret-dependent-branching-checked-by-valgrind: true
+    large-stack-usage: false
+  - upstream: primary-upstream
+    upstream-id: aarch64
+    supported-platforms:
+    - architecture: ARM64_V8
+      operating_systems:
+      - Linux
+      - Darwin
+    common-crypto:
+    - SHA3: liboqs
+    no-secret-dependent-branching-claimed: true
+    no-secret-dependent-branching-checked-by-valgrind: false
+    large-stack-usage: false
diff --git a/docs/cbom.json b/docs/cbom.json
index a9361e375..48f298f7f 100644
--- a/docs/cbom.json
+++ b/docs/cbom.json
@@ -2,23 +2,23 @@
   "$schema": "https://raw.githubusercontent.com/CycloneDX/specification/1.6/schema/bom-1.6.schema.json",
   "bomFormat": "CycloneDX",
   "specVersion": "1.6",
-  "serialNumber": "urn:uuid:d66add05-17dd-4986-8894-ed47d1e910b6",
+  "serialNumber": "urn:uuid:11c99519-c4e5-4517-8016-4932140dd322",
   "version": 1,
   "metadata": {
-    "timestamp": "2024-12-09T14:24:28.343759+00:00",
+    "timestamp": "2025-01-22T14:42:21.903424+00:00",
     "component": {
       "type": "library",
-      "bom-ref": "pkg:github/open-quantum-safe/liboqs@d0d0413dc9fff538296ab86bac492cb4bf54dedb",
+      "bom-ref": "pkg:github/open-quantum-safe/liboqs@af4928dddde853579f8a16a488cf3e142f177979",
       "name": "liboqs",
-      "version": "d0d0413dc9fff538296ab86bac492cb4bf54dedb"
+      "version": "af4928dddde853579f8a16a488cf3e142f177979"
     }
   },
   "components": [
     {
       "type": "library",
-      "bom-ref": "pkg:github/open-quantum-safe/liboqs@d0d0413dc9fff538296ab86bac492cb4bf54dedb",
+      "bom-ref": "pkg:github/open-quantum-safe/liboqs@af4928dddde853579f8a16a488cf3e142f177979",
       "name": "liboqs",
-      "version": "d0d0413dc9fff538296ab86bac492cb4bf54dedb"
+      "version": "af4928dddde853579f8a16a488cf3e142f177979"
     },
     {
       "type": "cryptographic-asset",
@@ -1060,6 +1060,46 @@
         }
       }
     },
+    {
+      "type": "cryptographic-asset",
+      "bom-ref": "alg:ML-KEM-512:x86_64",
+      "name": "ML-KEM",
+      "cryptoProperties": {
+        "assetType": "algorithm",
+        "algorithmProperties": {
+          "parameterSetIdentifier": "ML-KEM-512",
+          "primitive": "kem",
+          "executionEnvironment": "software-plain-ram",
+          "cryptoFunctions": [
+            "keygen",
+            "encapsulate",
+            "decapsulate"
+          ],
+          "nistQuantumSecurityLevel": 1,
+          "implementationPlatform": "x86_64"
+        }
+      }
+    },
+    {
+      "type": "cryptographic-asset",
+      "bom-ref": "alg:ML-KEM-512:armv8-a",
+      "name": "ML-KEM",
+      "cryptoProperties": {
+        "assetType": "algorithm",
+        "algorithmProperties": {
+          "parameterSetIdentifier": "ML-KEM-512",
+          "primitive": "kem",
+          "executionEnvironment": "software-plain-ram",
+          "cryptoFunctions": [
+            "keygen",
+            "encapsulate",
+            "decapsulate"
+          ],
+          "nistQuantumSecurityLevel": 1,
+          "implementationPlatform": "armv8-a"
+        }
+      }
+    },
     {
       "type": "cryptographic-asset",
       "bom-ref": "alg:ML-KEM-768:generic",
@@ -1080,6 +1120,46 @@
         }
       }
     },
+    {
+      "type": "cryptographic-asset",
+      "bom-ref": "alg:ML-KEM-768:x86_64",
+      "name": "ML-KEM",
+      "cryptoProperties": {
+        "assetType": "algorithm",
+        "algorithmProperties": {
+          "parameterSetIdentifier": "ML-KEM-768",
+          "primitive": "kem",
+          "executionEnvironment": "software-plain-ram",
+          "cryptoFunctions": [
+            "keygen",
+            "encapsulate",
+            "decapsulate"
+          ],
+          "nistQuantumSecurityLevel": 3,
+          "implementationPlatform": "x86_64"
+        }
+      }
+    },
+    {
+      "type": "cryptographic-asset",
+      "bom-ref": "alg:ML-KEM-768:armv8-a",
+      "name": "ML-KEM",
+      "cryptoProperties": {
+        "assetType": "algorithm",
+        "algorithmProperties": {
+          "parameterSetIdentifier": "ML-KEM-768",
+          "primitive": "kem",
+          "executionEnvironment": "software-plain-ram",
+          "cryptoFunctions": [
+            "keygen",
+            "encapsulate",
+            "decapsulate"
+          ],
+          "nistQuantumSecurityLevel": 3,
+          "implementationPlatform": "armv8-a"
+        }
+      }
+    },
     {
       "type": "cryptographic-asset",
       "bom-ref": "alg:ML-KEM-1024:generic",
@@ -1100,6 +1180,46 @@
         }
       }
     },
+    {
+      "type": "cryptographic-asset",
+      "bom-ref": "alg:ML-KEM-1024:x86_64",
+      "name": "ML-KEM",
+      "cryptoProperties": {
+        "assetType": "algorithm",
+        "algorithmProperties": {
+          "parameterSetIdentifier": "ML-KEM-1024",
+          "primitive": "kem",
+          "executionEnvironment": "software-plain-ram",
+          "cryptoFunctions": [
+            "keygen",
+            "encapsulate",
+            "decapsulate"
+          ],
+          "nistQuantumSecurityLevel": 5,
+          "implementationPlatform": "x86_64"
+        }
+      }
+    },
+    {
+      "type": "cryptographic-asset",
+      "bom-ref": "alg:ML-KEM-1024:armv8-a",
+      "name": "ML-KEM",
+      "cryptoProperties": {
+        "assetType": "algorithm",
+        "algorithmProperties": {
+          "parameterSetIdentifier": "ML-KEM-1024",
+          "primitive": "kem",
+          "executionEnvironment": "software-plain-ram",
+          "cryptoFunctions": [
+            "keygen",
+            "encapsulate",
+            "decapsulate"
+          ],
+          "nistQuantumSecurityLevel": 5,
+          "implementationPlatform": "armv8-a"
+        }
+      }
+    },
     {
       "type": "cryptographic-asset",
       "bom-ref": "alg:sntrup761:generic",
@@ -3067,7 +3187,7 @@
   ],
   "dependencies": [
     {
-      "ref": "pkg:github/open-quantum-safe/liboqs@d0d0413dc9fff538296ab86bac492cb4bf54dedb",
+      "ref": "pkg:github/open-quantum-safe/liboqs@af4928dddde853579f8a16a488cf3e142f177979",
       "provides": [
         "alg:BIKE-L1:x86_64",
         "alg:BIKE-L3:x86_64",
@@ -3121,8 +3241,14 @@
         "alg:Kyber1024:x86_64",
         "alg:Kyber1024:armv8-a",
         "alg:ML-KEM-512:generic",
+        "alg:ML-KEM-512:x86_64",
+        "alg:ML-KEM-512:armv8-a",
         "alg:ML-KEM-768:generic",
+        "alg:ML-KEM-768:x86_64",
+        "alg:ML-KEM-768:armv8-a",
         "alg:ML-KEM-1024:generic",
+        "alg:ML-KEM-1024:x86_64",
+        "alg:ML-KEM-1024:armv8-a",
         "alg:sntrup761:generic",
         "alg:sntrup761:x86_64",
         "alg:cross-rsdp-128-balanced:generic",
@@ -3542,18 +3668,54 @@
         "alg:sha3"
       ]
     },
+    {
+      "ref": "alg:ML-KEM-512:x86_64",
+      "dependsOn": [
+        "alg:sha3"
+      ]
+    },
+    {
+      "ref": "alg:ML-KEM-512:armv8-a",
+      "dependsOn": [
+        "alg:sha3"
+      ]
+    },
     {
       "ref": "alg:ML-KEM-768:generic",
       "dependsOn": [
         "alg:sha3"
       ]
     },
+    {
+      "ref": "alg:ML-KEM-768:x86_64",
+      "dependsOn": [
+        "alg:sha3"
+      ]
+    },
+    {
+      "ref": "alg:ML-KEM-768:armv8-a",
+      "dependsOn": [
+        "alg:sha3"
+      ]
+    },
     {
       "ref": "alg:ML-KEM-1024:generic",
       "dependsOn": [
         "alg:sha3"
       ]
     },
+    {
+      "ref": "alg:ML-KEM-1024:x86_64",
+      "dependsOn": [
+        "alg:sha3"
+      ]
+    },
+    {
+      "ref": "alg:ML-KEM-1024:armv8-a",
+      "dependsOn": [
+        "alg:sha3"
+      ]
+    },
     {
       "ref": "alg:sntrup761:generic",
       "dependsOn": [
diff --git a/scripts/copy_from_upstream/copy_from_upstream.py b/scripts/copy_from_upstream/copy_from_upstream.py
index 400ecc57a..1959a4b72 100755
--- a/scripts/copy_from_upstream/copy_from_upstream.py
+++ b/scripts/copy_from_upstream/copy_from_upstream.py
@@ -495,14 +495,24 @@ def handle_implementation(impl, family, scheme, dst_basedir):
         else:
             # determine list of files to copy:
             if 'sources' in i:
+                preserve_folder_structure = ('preserve_folder_structure' in i['upstream']) and i['upstream']['preserve_folder_structure'] == True
                 srcs = i['sources'].split(" ")
                 for s in srcs:
                     # Copy recursively only in case of directories not with plain files to avoid copying over symbolic links
                     if os.path.isfile(os.path.join(origfolder, s)):
-                        subprocess.run(['cp', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))])
+                        if preserve_folder_structure:
+                            subprocess.run(['mkdir', '-p', os.path.join(srcfolder, os.path.dirname(s))])
+                            subprocess.run(['cp', os.path.join(origfolder, s), os.path.join(srcfolder, s)])
+                        else:
+                            subprocess.run(['cp', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))])
+
                     else:
-                        subprocess.run(
-                            ['cp', '-r', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))])
+                        if preserve_folder_structure:
+                            subprocess.run(
+                                ['cp', '-r', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.dirname(s))])
+                        else:
+                            subprocess.run(
+                                ['cp', '-r', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))])
             else:
                 subprocess.run(['cp', '-pr', os.path.join(origfolder, '.'), srcfolder])
                 # raise Exception("Malformed YML file: No sources listed to copy. Check upstream YML file." )
diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml
index be9376220..188f9f937 100644
--- a/scripts/copy_from_upstream/copy_from_upstream.yml
+++ b/scripts/copy_from_upstream/copy_from_upstream.yml
@@ -33,11 +33,12 @@ upstreams:
   -
     name: mlkem-native
     git_url: https://github.com/pq-code-package/mlkem-native.git
-    git_branch: main
-    git_commit: 21c0c397f243543a9d4334860d9edb1d4e6a6cda
-    kem_meta_path: '{pretty_name_full}_META.yml'
+    git_branch: updates-8
+    git_commit: 68a82c658399c470624087b52c6d99032114c0b5
+    kem_meta_path: 'integration/liboqs/{pretty_name_full}_META.yml'
     kem_scheme_path: '.'
     patches: [mlkem-native.patch]
+    preserve_folder_structure: True
   -
     name: pqcrystals-dilithium
     git_url: https://github.com/pq-crystals/dilithium.git
diff --git a/scripts/copy_from_upstream/patches/mlkem-native.patch b/scripts/copy_from_upstream/patches/mlkem-native.patch
index 290c3f317..17317fc92 100644
--- a/scripts/copy_from_upstream/patches/mlkem-native.patch
+++ b/scripts/copy_from_upstream/patches/mlkem-native.patch
@@ -1,252 +1,5 @@
-diff --git a/ML-KEM-1024_META.yml b/ML-KEM-1024_META.yml
-new file mode 100644
-index 00000000..62b57bdd
---- /dev/null
-+++ b/ML-KEM-1024_META.yml
-@@ -0,0 +1,63 @@
-+name: ML-KEM-1024
-+type: kem
-+claimed-nist-level: 5
-+claimed-security: IND-CCA2
-+length-public-key: 1568
-+length-ciphertext: 1568
-+length-secret-key: 3168
-+length-shared-secret: 32
-+nistkat-sha256: f580d851e5fb27e6876e5e203fa18be4cdbfd49e05d48fec3d3992c8f43a13e6
-+testvectors-sha256: ff1a854b9b6761a70c65ccae85246fe0596a949e72eae0866a8a2a2d4ea54b10
-+principal-submitters:
-+  - Peter Schwabe
-+auxiliary-submitters:
-+  - Roberto Avanzi
-+  - Joppe Bos
-+  - Léo Ducas
-+  - Eike Kiltz
-+  - Tancrède Lepoint
-+  - Vadim Lyubashevsky
-+  - John M. Schanck
-+  - Gregor Seiler
-+  - Damien Stehlé
-+implementations:
-+  - name: ref
-+    version: FIPS203
-+    folder_name: mlkem
-+    compile_opts: -DMLKEM_K=4 -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM1024_C
-+    signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_C_keypair
-+    signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_C_enc
-+    signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_C_dec
-+    sources: LICENSE arith_backend.h cbd.c cbd.h cbmc.h common.h config.h debug.c debug.h indcpa.c indcpa.h kem.c kem.h mlkem_native.h ntt.c ntt.h params.h poly.c poly.h polyvec.c polyvec.h reduce.h rej_uniform.c rej_uniform.h symmetric.h sys.h verify.c verify.h zetas.c native/api.h native/default.h
-+  - name: x86_64
-+    version: FIPS203
-+    folder_name: mlkem
-+    compile_opts: -DMLKEM_K=4 -DFORCE_X86_64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=X86_64_DEFAULT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_DEFAULT
-+    signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_DEFAULT_keypair
-+    signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_DEFAULT_enc
-+    signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_DEFAULT_dec
-+    sources: LICENSE arith_backend.h cbd.c cbd.h cbmc.h common.h config.h debug.c debug.h indcpa.c indcpa.h kem.c kem.h mlkem_native.h ntt.c ntt.h params.h poly.c poly.h polyvec.c polyvec.h reduce.h rej_uniform.c rej_uniform.h symmetric.h sys.h verify.c verify.h zetas.c native/api.h native/default.h native/x86_64
-+    supported_platforms:
-+      - architecture: x86_64
-+        operating_systems:
-+          - Linux
-+          - Darwin
-+        required_flags:
-+          - avx2
-+          - bmi2
-+          - popcnt
-+  - name: aarch64
-+    version: FIPS203
-+    folder_name: mlkem
-+    compile_opts: -DMLKEM_K=4 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_OPT
-+    signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_OPT_keypair
-+    signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_OPT_enc
-+    signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_OPT_dec
-+    sources: LICENSE arith_backend.h cbd.c cbd.h cbmc.h common.h config.h debug.c debug.h indcpa.c indcpa.h kem.c kem.h mlkem_native.h ntt.c ntt.h params.h poly.c poly.h polyvec.c polyvec.h reduce.h rej_uniform.c rej_uniform.h symmetric.h sys.h verify.c verify.h zetas.c native/api.h native/default.h native/aarch64
-+    supported_platforms:
-+      - architecture: arm_8
-+        operating_systems:
-+            - Linux
-+            - Darwin
-+        required_flags:
-+            - asimd
-\ No newline at end of file
-diff --git a/ML-KEM-512_META.yml b/ML-KEM-512_META.yml
-new file mode 100644
-index 00000000..242503cf
---- /dev/null
-+++ b/ML-KEM-512_META.yml
-@@ -0,0 +1,63 @@
-+name: ML-KEM-512
-+type: kem
-+claimed-nist-level: 1
-+claimed-security: IND-CCA2
-+length-public-key: 800
-+length-ciphertext: 768
-+length-secret-key: 1632
-+length-shared-secret: 32
-+nistkat-sha256: c70041a761e01cd6426fa60e9fd6a4412c2be817386c8d0f3334898082512782
-+testvectors-sha256: 6730bb552c22d9d2176ffb5568e48eb30952cf1f065073ec5f9724f6a3c6ea85
-+principal-submitters:
-+  - Peter Schwabe
-+auxiliary-submitters:
-+  - Roberto Avanzi
-+  - Joppe Bos
-+  - Léo Ducas
-+  - Eike Kiltz
-+  - Tancrède Lepoint
-+  - Vadim Lyubashevsky
-+  - John M. Schanck
-+  - Gregor Seiler
-+  - Damien Stehlé
-+implementations:
-+  - name: ref
-+    version: FIPS203
-+    folder_name: mlkem
-+    compile_opts: -DMLKEM_K=2 -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM512_C
-+    signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_C_keypair
-+    signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_C_enc
-+    signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_C_dec
-+    sources: LICENSE arith_backend.h cbd.c cbd.h cbmc.h common.h config.h debug.c debug.h indcpa.c indcpa.h kem.c kem.h mlkem_native.h ntt.c ntt.h params.h poly.c poly.h polyvec.c polyvec.h reduce.h rej_uniform.c rej_uniform.h symmetric.h sys.h verify.c verify.h zetas.c native/api.h native/default.h
-+  - name: x86_64
-+    version: FIPS203
-+    folder_name: mlkem
-+    compile_opts: -DMLKEM_K=2 -DFORCE_X86_64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=X86_64_DEFAULT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM512_X86_64_DEFAULT
-+    signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_DEFAULT_keypair
-+    signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_DEFAULT_enc
-+    signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_DEFAULT_dec
-+    sources: LICENSE arith_backend.h cbd.c cbd.h cbmc.h common.h config.h debug.c debug.h indcpa.c indcpa.h kem.c kem.h mlkem_native.h ntt.c ntt.h params.h poly.c poly.h polyvec.c polyvec.h reduce.h rej_uniform.c rej_uniform.h symmetric.h sys.h verify.c verify.h zetas.c native/api.h native/default.h native/x86_64
-+    supported_platforms:
-+      - architecture: x86_64
-+        operating_systems:
-+          - Linux
-+          - Darwin
-+        required_flags:
-+          - avx2
-+          - bmi2
-+          - popcnt
-+  - name: aarch64
-+    version: FIPS203
-+    folder_name: mlkem
-+    compile_opts: -DMLKEM_K=2 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT
-+    signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT_keypair
-+    signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT_enc
-+    signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT_dec
-+    sources: LICENSE arith_backend.h cbd.c cbd.h cbmc.h common.h config.h debug.c debug.h indcpa.c indcpa.h kem.c kem.h mlkem_native.h ntt.c ntt.h params.h poly.c poly.h polyvec.c polyvec.h reduce.h rej_uniform.c rej_uniform.h symmetric.h sys.h verify.c verify.h zetas.c native/api.h native/default.h native/aarch64
-+    supported_platforms:
-+      - architecture: arm_8
-+        operating_systems:
-+            - Linux
-+            - Darwin
-+        required_flags:
-+            - asimd
-\ No newline at end of file
-diff --git a/ML-KEM-768_META.yml b/ML-KEM-768_META.yml
-new file mode 100644
-index 00000000..74e23d9a
---- /dev/null
-+++ b/ML-KEM-768_META.yml
-@@ -0,0 +1,63 @@
-+name: ML-KEM-768
-+type: kem
-+claimed-nist-level: 3
-+claimed-security: IND-CCA2
-+length-public-key: 1184
-+length-ciphertext: 1088
-+length-secret-key: 2400
-+length-shared-secret: 32
-+nistkat-sha256: 5352539586b6c3df58be6158a6250aeff402bd73060b0a3de68850ac074c17c3
-+testvectors-sha256: 667c8ca2ca93729c0df6ff24588460bad1bbdbfb64ece0fe8563852a7ff348c6
-+principal-submitters:
-+  - Peter Schwabe
-+auxiliary-submitters:
-+  - Roberto Avanzi
-+  - Joppe Bos
-+  - Léo Ducas
-+  - Eike Kiltz
-+  - Tancrède Lepoint
-+  - Vadim Lyubashevsky
-+  - John M. Schanck
-+  - Gregor Seiler
-+  - Damien Stehlé
-+implementations:
-+  - name: ref
-+    version: FIPS203
-+    folder_name: mlkem
-+    compile_opts: -DMLKEM_K=3 -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM768_C
-+    signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_C_keypair
-+    signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_C_enc
-+    signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_C_dec
-+    sources: LICENSE arith_backend.h cbd.c cbd.h cbmc.h common.h config.h debug.c debug.h indcpa.c indcpa.h kem.c kem.h mlkem_native.h ntt.c ntt.h params.h poly.c poly.h polyvec.c polyvec.h reduce.h rej_uniform.c rej_uniform.h symmetric.h sys.h verify.c verify.h zetas.c native/api.h native/default.h
-+  - name: x86_64
-+    version: FIPS203
-+    folder_name: mlkem
-+    compile_opts: -DMLKEM_K=3 -DFORCE_X86_64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=X86_64_DEFAULT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM768_X86_64_DEFAULT
-+    signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_DEFAULT_keypair
-+    signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_DEFAULT_enc
-+    signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_DEFAULT_dec
-+    sources: LICENSE arith_backend.h cbd.c cbd.h cbmc.h common.h config.h debug.c debug.h indcpa.c indcpa.h kem.c kem.h mlkem_native.h ntt.c ntt.h params.h poly.c poly.h polyvec.c polyvec.h reduce.h rej_uniform.c rej_uniform.h symmetric.h sys.h verify.c verify.h zetas.c native/api.h native/default.h native/x86_64
-+    supported_platforms:
-+      - architecture: x86_64
-+        operating_systems:
-+          - Linux
-+          - Darwin
-+        required_flags:
-+          - avx2
-+          - bmi2
-+          - popcnt
-+  - name: aarch64
-+    version: FIPS203
-+    folder_name: mlkem
-+    compile_opts: -DMLKEM_K=3 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_OPT
-+    signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_OPT_keypair
-+    signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_OPT_enc
-+    signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_OPT_dec
-+    sources: LICENSE arith_backend.h cbd.c cbd.h cbmc.h common.h config.h debug.c debug.h indcpa.c indcpa.h kem.c kem.h mlkem_native.h ntt.c ntt.h params.h poly.c poly.h polyvec.c polyvec.h reduce.h rej_uniform.c rej_uniform.h symmetric.h sys.h verify.c verify.h zetas.c native/api.h native/default.h native/aarch64
-+    supported_platforms:
-+      - architecture: arm_8
-+        operating_systems:
-+            - Linux
-+            - Darwin
-+        required_flags:
-+            - asimd
-\ No newline at end of file
-diff --git a/mlkem/arith_backend.h b/mlkem/arith_backend.h
-index ade31cda..0543b1bd 100644
---- a/mlkem/arith_backend.h
-+++ b/mlkem/arith_backend.h
-@@ -17,7 +17,7 @@
-  * Keep this _after_ the inclusion of the backend; otherwise,
-  * the sanity checks won't have an effect. */
- #if defined(MLKEM_NATIVE_CHECK_APIS)
--#include "native/api.h"
-+#include "api.h"
- #endif
- #endif
- 
-diff --git a/mlkem/config.h b/mlkem/config.h
-index 24a49709..fa89370c 100644
---- a/mlkem/config.h
-+++ b/mlkem/config.h
-@@ -146,7 +146,7 @@
-  *
-  *****************************************************************************/
- #if defined(MLKEM_USE_NATIVE) && !defined(MLKEM_NATIVE_ARITH_BACKEND)
--#define MLKEM_NATIVE_ARITH_BACKEND "native/default.h"
-+#define MLKEM_NATIVE_ARITH_BACKEND "default.h"
- #endif /* MLKEM_NATIVE_ARITH_BACKEND */
- 
- /******************************************************************************
-@@ -159,8 +159,8 @@
-  *              This can be set using CFLAGS.
-  *
-  *****************************************************************************/
--#if defined(MLKEM_USE_NATIVE) && !defined(MLKEM_NATIVE_FIPS202_BACKEND)
--#define MLKEM_NATIVE_FIPS202_BACKEND "fips202/native/default.h"
-+#if defined(MLKEM_USE_NATIVE_FIPS202) && !defined(MLKEM_NATIVE_FIPS202_BACKEND)
-+#define MLKEM_NATIVE_FIPS202_BACKEND "native/default.h"
- #endif /* MLKEM_NATIVE_FIPS202_BACKEND */
- 
- /*************************  Config internals  ********************************/
 diff --git a/mlkem/indcpa.c b/mlkem/indcpa.c
-index 390cc6f2..0cfcc3e9 100644
+index fdca7caf..318d0fc7 100644
 --- a/mlkem/indcpa.c
 +++ b/mlkem/indcpa.c
 @@ -6,8 +6,8 @@
@@ -258,458 +11,25 @@ index 390cc6f2..0cfcc3e9 100644
 +#include "fips202.h"
 +#include "fips202x4.h"
  #include "indcpa.h"
- #include "ntt.h"
  #include "poly.h"
-diff --git a/mlkem/native/aarch64/clean.h b/mlkem/native/aarch64/clean.h
-index f124702a..43a401df 100644
---- a/mlkem/native/aarch64/clean.h
-+++ b/mlkem/native/aarch64/clean.h
-@@ -19,6 +19,6 @@
- /* Filename of the C backend implementation.
-  * This is not inlined here because this header is included in assembly
-  * files as well. */
--#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/aarch64/src/clean_impl.h"
-+#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "aarch64/src/clean_impl.h"
- 
- #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */
-diff --git a/mlkem/native/aarch64/opt.h b/mlkem/native/aarch64/opt.h
-index a7217163..04323c3e 100644
---- a/mlkem/native/aarch64/opt.h
-+++ b/mlkem/native/aarch64/opt.h
-@@ -19,6 +19,6 @@
- /* Filename of the C backend implementation.
-  * This is not inlined here because this header is included in assembly
-  * files as well. */
--#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/aarch64/src/opt_impl.h"
-+#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "aarch64/src/opt_impl.h"
- 
- #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */
-diff --git a/mlkem/native/aarch64/src/aarch64_zetas.c b/mlkem/native/aarch64/src/aarch64_zetas.c
-index b3a6f198..1e189fd9 100644
---- a/mlkem/native/aarch64/src/aarch64_zetas.c
-+++ b/mlkem/native/aarch64/src/aarch64_zetas.c
-@@ -8,7 +8,7 @@
-  *          Do not modify it directly.
-  */
- 
--#include "../../../common.h"
-+#include "common.h"
- 
- #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \
-     defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
-diff --git a/mlkem/native/aarch64/src/arith_native_aarch64.h b/mlkem/native/aarch64/src/arith_native_aarch64.h
-index a784a302..fc4e7dd3 100644
---- a/mlkem/native/aarch64/src/arith_native_aarch64.h
-+++ b/mlkem/native/aarch64/src/arith_native_aarch64.h
-@@ -6,7 +6,7 @@
- #define MLKEM_AARCH64_NATIVE_H
- 
- #include <stdint.h>
--#include "../../../common.h"
-+#include "common.h"
- 
- #define aarch64_ntt_zetas_layer01234 \
-   MLKEM_NAMESPACE(aarch64_ntt_zetas_layer01234)
-diff --git a/mlkem/native/aarch64/src/clean_impl.h b/mlkem/native/aarch64/src/clean_impl.h
-index 805adef1..548b1eeb 100644
---- a/mlkem/native/aarch64/src/clean_impl.h
-+++ b/mlkem/native/aarch64/src/clean_impl.h
-@@ -12,8 +12,8 @@
- 
- #include "arith_native_aarch64.h"
- 
--#include "../../../poly.h"
--#include "../../../polyvec.h"
-+#include "poly.h"
-+#include "polyvec.h"
- 
- /* Set of primitives that this backend replaces */
- #define MLKEM_USE_NATIVE_NTT
-diff --git a/mlkem/native/aarch64/src/consts.h b/mlkem/native/aarch64/src/consts.h
-index e3ea26a2..c4094729 100644
---- a/mlkem/native/aarch64/src/consts.h
-+++ b/mlkem/native/aarch64/src/consts.h
-@@ -7,7 +7,7 @@
- #define MLKEM_NATIVE_AARCH64_CONSTS
- 
- #include <stdint.h>
--#include "../../../common.h"
-+#include "common.h"
- 
- #define zetas_mulcache_native MLKEM_NAMESPACE(zetas_mulcache_native)
- extern const int16_t zetas_mulcache_native[256];
-diff --git a/mlkem/native/aarch64/src/intt_clean.S b/mlkem/native/aarch64/src/intt_clean.S
-index 28ad3897..b243a569 100644
---- a/mlkem/native/aarch64/src/intt_clean.S
-+++ b/mlkem/native/aarch64/src/intt_clean.S
-@@ -23,7 +23,7 @@
- /// SOFTWARE.
- ///
- 
--#include "../../../common.h"
-+#include "common.h"
- #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
- 
- // Bounds:
-diff --git a/mlkem/native/aarch64/src/intt_opt.S b/mlkem/native/aarch64/src/intt_opt.S
-index 857c729c..c94746e1 100644
---- a/mlkem/native/aarch64/src/intt_opt.S
-+++ b/mlkem/native/aarch64/src/intt_opt.S
-@@ -23,7 +23,7 @@
- /// SOFTWARE.
- ///
- 
--#include "../../../common.h"
-+#include "common.h"
- #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
- 
- // Bounds:
-diff --git a/mlkem/native/aarch64/src/ntt_clean.S b/mlkem/native/aarch64/src/ntt_clean.S
-index 30fdc76b..cd63cc4d 100644
---- a/mlkem/native/aarch64/src/ntt_clean.S
-+++ b/mlkem/native/aarch64/src/ntt_clean.S
-@@ -24,7 +24,7 @@
- /// SOFTWARE.
- ///
- 
--#include "../../../common.h"
-+#include "common.h"
- #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
- 
- // Bounds:
-diff --git a/mlkem/native/aarch64/src/ntt_opt.S b/mlkem/native/aarch64/src/ntt_opt.S
-index 431f9dc6..8705615b 100644
---- a/mlkem/native/aarch64/src/ntt_opt.S
-+++ b/mlkem/native/aarch64/src/ntt_opt.S
-@@ -24,7 +24,7 @@
- /// SOFTWARE.
- ///
- 
--#include "../../../common.h"
-+#include "common.h"
- #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
- 
- // Bounds:
-diff --git a/mlkem/native/aarch64/src/opt_impl.h b/mlkem/native/aarch64/src/opt_impl.h
-index b92f3adf..ec1bf658 100644
---- a/mlkem/native/aarch64/src/opt_impl.h
-+++ b/mlkem/native/aarch64/src/opt_impl.h
-@@ -12,8 +12,8 @@
- 
- #include "arith_native_aarch64.h"
- 
--#include "../../../poly.h"
--#include "../../../polyvec.h"
-+#include "poly.h"
-+#include "polyvec.h"
- 
- /* Set of primitives that this backend replaces */
- #define MLKEM_USE_NATIVE_NTT
-diff --git a/mlkem/native/aarch64/src/poly_clean.S b/mlkem/native/aarch64/src/poly_clean.S
-index f3ee0796..809f9667 100644
---- a/mlkem/native/aarch64/src/poly_clean.S
-+++ b/mlkem/native/aarch64/src/poly_clean.S
-@@ -3,7 +3,7 @@
-  * SPDX-License-Identifier: Apache-2.0
-  */
- 
--#include "../../../common.h"
-+#include "common.h"
- #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
- 
- /*
-diff --git a/mlkem/native/aarch64/src/poly_opt.S b/mlkem/native/aarch64/src/poly_opt.S
-index 555c60a6..815a9dd1 100644
---- a/mlkem/native/aarch64/src/poly_opt.S
-+++ b/mlkem/native/aarch64/src/poly_opt.S
-@@ -3,7 +3,7 @@
-  * SPDX-License-Identifier: Apache-2.0
-  */
- 
--#include "../../../common.h"
-+#include "common.h"
- #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
- 
- /*
-diff --git a/mlkem/native/aarch64/src/polyvec_clean.S b/mlkem/native/aarch64/src/polyvec_clean.S
-index 0b6df634..c91675b4 100644
---- a/mlkem/native/aarch64/src/polyvec_clean.S
-+++ b/mlkem/native/aarch64/src/polyvec_clean.S
-@@ -9,7 +9,7 @@
- // https://eprint.iacr.org/2021/986
- // https://github.com/neon-ntt/neon-ntt
- 
--#include "../../../common.h"
-+#include "common.h"
- #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
- 
- // Input:
-diff --git a/mlkem/native/aarch64/src/polyvec_opt.S b/mlkem/native/aarch64/src/polyvec_opt.S
-index 7a27fda3..8300b682 100644
---- a/mlkem/native/aarch64/src/polyvec_opt.S
-+++ b/mlkem/native/aarch64/src/polyvec_opt.S
-@@ -9,7 +9,7 @@
- // https://eprint.iacr.org/2021/986
- // https://github.com/neon-ntt/neon-ntt
- 
--#include "../../../common.h"
-+#include "common.h"
- #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
- 
- // Input:
-diff --git a/mlkem/native/aarch64/src/rej_uniform_asm_clean.S b/mlkem/native/aarch64/src/rej_uniform_asm_clean.S
-index 9158d6c8..5151a05d 100644
---- a/mlkem/native/aarch64/src/rej_uniform_asm_clean.S
-+++ b/mlkem/native/aarch64/src/rej_uniform_asm_clean.S
-@@ -18,7 +18,7 @@
-  *
-  * Returns number of sampled 16-bit integers (at most MLKEM_N).
-  **************************************************/
--#include "../../../common.h"
-+#include "common.h"
- #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \
-     defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
- 
-diff --git a/mlkem/native/aarch64/src/rej_uniform_table.c b/mlkem/native/aarch64/src/rej_uniform_table.c
-index 29cdbe95..50766034 100644
---- a/mlkem/native/aarch64/src/rej_uniform_table.c
-+++ b/mlkem/native/aarch64/src/rej_uniform_table.c
-@@ -8,7 +8,7 @@
-  *          Do not modify it directly.
-  */
- 
--#include "../../../common.h"
-+#include "common.h"
- 
- #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \
-     defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
-diff --git a/mlkem/native/api.h b/mlkem/native/api.h
-index 5732b97c..792ecb8a 100644
---- a/mlkem/native/api.h
-+++ b/mlkem/native/api.h
-@@ -23,8 +23,8 @@
- #define MLKEM_NATIVE_ARITH_NATIVE_API_H
- 
- #include <stdint.h>
--#include "../poly.h"
--#include "../polyvec.h"
-+#include "poly.h"
-+#include "polyvec.h"
- 
- /*
-  * This is the C<->native interface allowing for the drop-in of
-diff --git a/mlkem/native/default.h b/mlkem/native/default.h
-index f9fe4310..d1e41c52 100644
---- a/mlkem/native/default.h
-+++ b/mlkem/native/default.h
-@@ -8,7 +8,7 @@
- /*
-  * Default arithmetic backend
-  */
--#include "../sys.h"
-+#include "sys.h"
- 
- #ifdef SYS_AARCH64
- /*
-diff --git a/mlkem/native/x86_64/default.h b/mlkem/native/x86_64/default.h
-index 73f53dc1..592e8996 100644
---- a/mlkem/native/x86_64/default.h
-+++ b/mlkem/native/x86_64/default.h
-@@ -19,6 +19,6 @@
- /* Filename of the C backend implementation.
-  * This is not inlined here because this header is included in assembly
-  * files as well. */
--#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/x86_64/src/default_impl.h"
-+#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "x86_64/src/default_impl.h"
- 
- #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */
-diff --git a/mlkem/native/x86_64/src/arith_native_x86_64.h b/mlkem/native/x86_64/src/arith_native_x86_64.h
-index acf3ae56..25e00a93 100644
---- a/mlkem/native/x86_64/src/arith_native_x86_64.h
-+++ b/mlkem/native/x86_64/src/arith_native_x86_64.h
-@@ -5,11 +5,11 @@
- #ifndef MLKEM_X86_64_NATIVE_H
- #define MLKEM_X86_64_NATIVE_H
- 
--#include "../../../common.h"
-+#include "common.h"
- 
- #include <immintrin.h>
- #include <stdint.h>
--#include "../../../polyvec.h"
-+#include "polyvec.h"
- #include "consts.h"
- 
- #define REJ_UNIFORM_AVX_NBLOCKS 3 /* See MLKEM_GEN_MATRIX_NBLOCKS */
-diff --git a/mlkem/native/x86_64/src/basemul.S b/mlkem/native/x86_64/src/basemul.S
-index 5fdc3d0a..b97840e7 100644
---- a/mlkem/native/x86_64/src/basemul.S
-+++ b/mlkem/native/x86_64/src/basemul.S
-@@ -6,7 +6,7 @@
- // Implementation from Kyber reference repository
- // https://github.com/pq-crystals/kyber/blob/main/avx2
- 
--#include "../../../common.h"
-+#include "common.h"
- #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
- 
- #include "consts.h"
-diff --git a/mlkem/native/x86_64/src/basemul.c b/mlkem/native/x86_64/src/basemul.c
-index 8a23ddcc..5f9ae99c 100644
---- a/mlkem/native/x86_64/src/basemul.c
-+++ b/mlkem/native/x86_64/src/basemul.c
-@@ -3,12 +3,12 @@
-  * SPDX-License-Identifier: Apache-2.0
-  */
- 
--#include "../../../common.h"
-+#include "common.h"
- 
- #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
- 
--#include "../../../poly.h"
--#include "../../../polyvec.h"
-+#include "poly.h"
-+#include "polyvec.h"
- 
- #include "arith_native_x86_64.h"
- #include "consts.h"
-diff --git a/mlkem/native/x86_64/src/consts.c b/mlkem/native/x86_64/src/consts.c
-index 568752ae..86a0835e 100644
---- a/mlkem/native/x86_64/src/consts.c
-+++ b/mlkem/native/x86_64/src/consts.c
-@@ -8,7 +8,7 @@
-  * https://github.com/pq-crystals/kyber/blob/main/avx2/consts.c
-  */
- 
--#include "../../../common.h"
-+#include "common.h"
- 
- #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
- 
-diff --git a/mlkem/native/x86_64/src/consts.h b/mlkem/native/x86_64/src/consts.h
-index e2846b60..00c41595 100644
---- a/mlkem/native/x86_64/src/consts.h
-+++ b/mlkem/native/x86_64/src/consts.h
-@@ -11,7 +11,7 @@
- #ifndef CONSTS_H
- #define CONSTS_H
- 
--#include "../../../common.h"
-+#include "common.h"
- 
- #define AVX2_BACKEND_DATA_OFFSET_16XQ 0
- #define AVX2_BACKEND_DATA_OFFSET_16XQINV 16
-diff --git a/mlkem/native/x86_64/src/default_impl.h b/mlkem/native/x86_64/src/default_impl.h
-index cdbd44da..029111c1 100644
---- a/mlkem/native/x86_64/src/default_impl.h
-+++ b/mlkem/native/x86_64/src/default_impl.h
-@@ -12,8 +12,8 @@
- 
- #include <string.h>
- 
--#include "../../../poly.h"
--#include "../../../polyvec.h"
-+#include "poly.h"
-+#include "polyvec.h"
- #include "arith_native_x86_64.h"
- 
- #define MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
-diff --git a/mlkem/native/x86_64/src/fq.S b/mlkem/native/x86_64/src/fq.S
-index 3f013a5f..134bd4f7 100644
---- a/mlkem/native/x86_64/src/fq.S
-+++ b/mlkem/native/x86_64/src/fq.S
-@@ -11,7 +11,7 @@
- //   in [0,1,...,q-1] rather than [0,1,...,q], matching the
- //   semantics of poly_reduce().
- 
--#include "../../../common.h"
-+#include "common.h"
- 
- #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
- #include "consts.h"
-diff --git a/mlkem/native/x86_64/src/intt.S b/mlkem/native/x86_64/src/intt.S
-index 7b1f2262..6b1d78ef 100644
---- a/mlkem/native/x86_64/src/intt.S
-+++ b/mlkem/native/x86_64/src/intt.S
-@@ -9,7 +9,7 @@
-  * Changes to placement of modular reductions have
-  * been made to simplify reasoning of non-overflow */
- 
--#include "../../../common.h"
-+#include "common.h"
- 
- #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
- 
-diff --git a/mlkem/native/x86_64/src/ntt.S b/mlkem/native/x86_64/src/ntt.S
-index 5d928b4c..e8bf7894 100644
---- a/mlkem/native/x86_64/src/ntt.S
-+++ b/mlkem/native/x86_64/src/ntt.S
-@@ -6,7 +6,7 @@
- // Implementation from Kyber reference repository
- // https://github.com/pq-crystals/kyber/blob/main/avx2
- 
--#include "../../../common.h"
-+#include "common.h"
- #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
- 
- #include "consts.h"
-diff --git a/mlkem/native/x86_64/src/rej_uniform_avx2.c b/mlkem/native/x86_64/src/rej_uniform_avx2.c
-index adf2d338..54037a0d 100644
---- a/mlkem/native/x86_64/src/rej_uniform_avx2.c
-+++ b/mlkem/native/x86_64/src/rej_uniform_avx2.c
-@@ -8,7 +8,7 @@
-  * https://github.com/pq-crystals/kyber/blob/main/avx2
-  */
- 
--#include "../../../common.h"
-+#include "common.h"
- 
- #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
- 
-diff --git a/mlkem/native/x86_64/src/rej_uniform_table.c b/mlkem/native/x86_64/src/rej_uniform_table.c
-index e95fd9e7..9bbc4714 100644
---- a/mlkem/native/x86_64/src/rej_uniform_table.c
-+++ b/mlkem/native/x86_64/src/rej_uniform_table.c
-@@ -8,7 +8,7 @@
-  *          Do not modify it directly.
-  */
- 
--#include "../../../common.h"
-+#include "common.h"
- 
- #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
- 
-diff --git a/mlkem/native/x86_64/src/shuffle.S b/mlkem/native/x86_64/src/shuffle.S
-index 9bcd0489..5e708748 100644
---- a/mlkem/native/x86_64/src/shuffle.S
-+++ b/mlkem/native/x86_64/src/shuffle.S
-@@ -6,7 +6,7 @@
- // Implementation from Kyber reference repository
- // https://github.com/pq-crystals/kyber/blob/main/avx2
- 
--#include "../../../common.h"
-+#include "common.h"
- 
- #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
- 
+ #include "poly_k.h"
 diff --git a/mlkem/poly.c b/mlkem/poly.c
-index 26b358a2..7483ebf6 100644
+index 2bf10fd0..e8a2e2c6 100644
 --- a/mlkem/poly.c
 +++ b/mlkem/poly.c
-@@ -11,7 +11,7 @@
- #include "cbd.h"
+@@ -10,7 +10,7 @@
+ #include "arith_backend.h"
  #include "cbmc.h"
  #include "debug.h"
 -#include "fips202/fips202x4.h"
 +#include "fips202x4.h"
- #include "ntt.h"
  #include "poly.h"
- #include "reduce.h"
-diff --git a/mlkem/rej_uniform.c b/mlkem/rej_uniform.c
-index 626a440e..cbbe4407 100644
---- a/mlkem/rej_uniform.c
-+++ b/mlkem/rej_uniform.c
+ #include "sampling.h"
+ #include "symmetric.h"
+diff --git a/mlkem/sampling.c b/mlkem/sampling.c
+index 3402ab25..98cbdcb7 100644
+--- a/mlkem/sampling.c
++++ b/mlkem/sampling.c
 @@ -7,8 +7,8 @@
  
  #include "arith_backend.h"
@@ -718,10 +38,10 @@ index 626a440e..cbbe4407 100644
 -#include "fips202/fips202x4.h"
 +#include "fips202.h"
 +#include "fips202x4.h"
- #include "rej_uniform.h"
+ #include "sampling.h"
  #include "symmetric.h"
  
-@@ -155,6 +155,8 @@ void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+@@ -157,6 +157,8 @@ void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
    xof_x4_ctx statex;
    unsigned int buflen;
  
@@ -730,7 +50,7 @@ index 626a440e..cbbe4407 100644
    /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
    xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
                  MLKEM_SYMBYTES + 2);
-@@ -205,6 +207,8 @@ void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+@@ -207,6 +209,8 @@ void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
    uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
    unsigned int ctr, buflen;
  
diff --git a/src/kem/ml_kem/CMakeLists.txt b/src/kem/ml_kem/CMakeLists.txt
index fc2655ddf..102e8993c 100644
--- a/src/kem/ml_kem/CMakeLists.txt
+++ b/src/kem/ml_kem/CMakeLists.txt
@@ -6,7 +6,7 @@
 set(_ML_KEM_OBJS "")
 
 if(OQS_ENABLE_KEM_ml_kem_512)
-    add_library(ml_kem_512_ref OBJECT kem_ml_kem_512.c mlkem-native_ml-kem-512_ref/cbd.c mlkem-native_ml-kem-512_ref/debug.c mlkem-native_ml-kem-512_ref/indcpa.c mlkem-native_ml-kem-512_ref/kem.c mlkem-native_ml-kem-512_ref/ntt.c mlkem-native_ml-kem-512_ref/poly.c mlkem-native_ml-kem-512_ref/polyvec.c mlkem-native_ml-kem-512_ref/rej_uniform.c mlkem-native_ml-kem-512_ref/verify.c mlkem-native_ml-kem-512_ref/zetas.c)
+    add_library(ml_kem_512_ref OBJECT kem_ml_kem_512.c mlkem-native_ml-kem-512_ref/compress.c mlkem-native_ml-kem-512_ref/debug.c mlkem-native_ml-kem-512_ref/indcpa.c mlkem-native_ml-kem-512_ref/kem.c mlkem-native_ml-kem-512_ref/poly.c mlkem-native_ml-kem-512_ref/poly_k.c mlkem-native_ml-kem-512_ref/sampling.c mlkem-native_ml-kem-512_ref/verify.c mlkem-native_ml-kem-512_ref/zetas.c)
     target_compile_options(ml_kem_512_ref PUBLIC -DMLKEM_K=2 -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM512_C)
     target_include_directories(ml_kem_512_ref PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-512_ref)
     target_include_directories(ml_kem_512_ref PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
@@ -15,24 +15,24 @@ if(OQS_ENABLE_KEM_ml_kem_512)
 endif()
 
 if(OQS_ENABLE_KEM_ml_kem_512_x86_64)
-    add_library(ml_kem_512_x86_64 OBJECT mlkem-native_ml-kem-512_x86_64/cbd.c mlkem-native_ml-kem-512_x86_64/debug.c mlkem-native_ml-kem-512_x86_64/indcpa.c mlkem-native_ml-kem-512_x86_64/kem.c mlkem-native_ml-kem-512_x86_64/ntt.c mlkem-native_ml-kem-512_x86_64/poly.c mlkem-native_ml-kem-512_x86_64/polyvec.c mlkem-native_ml-kem-512_x86_64/rej_uniform.c mlkem-native_ml-kem-512_x86_64/verify.c mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.c mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.S mlkem-native_ml-kem-512_x86_64/x86_64/src/consts.c mlkem-native_ml-kem-512_x86_64/x86_64/src/fq.S mlkem-native_ml-kem-512_x86_64/x86_64/src/intt.S mlkem-native_ml-kem-512_x86_64/x86_64/src/ntt.S mlkem-native_ml-kem-512_x86_64/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-512_x86_64/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-512_x86_64/x86_64/src/shuffle.S mlkem-native_ml-kem-512_x86_64/zetas.c)
+    add_library(ml_kem_512_x86_64 OBJECT mlkem-native_ml-kem-512_x86_64/compress.c mlkem-native_ml-kem-512_x86_64/debug.c mlkem-native_ml-kem-512_x86_64/indcpa.c mlkem-native_ml-kem-512_x86_64/kem.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/consts.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/fq.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/intt.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntt.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/shuffle.S mlkem-native_ml-kem-512_x86_64/poly.c mlkem-native_ml-kem-512_x86_64/poly_k.c mlkem-native_ml-kem-512_x86_64/sampling.c mlkem-native_ml-kem-512_x86_64/verify.c mlkem-native_ml-kem-512_x86_64/zetas.c)
     target_include_directories(ml_kem_512_x86_64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-512_x86_64)
     target_include_directories(ml_kem_512_x86_64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
     target_compile_options(ml_kem_512_x86_64 PRIVATE  -mavx2  -mbmi2  -mpopcnt )
-    target_compile_options(ml_kem_512_x86_64 PUBLIC -DMLKEM_K=2 -DFORCE_X86_64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=X86_64_DEFAULT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM512_X86_64_DEFAULT)
+    target_compile_options(ml_kem_512_x86_64 PUBLIC -DMLKEM_K=2 -DFORCE_X86_64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=X86_64_DEFAULT -DMLKEM_USE_NATIVE_BACKEND_ARITH -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM512_X86_64_DEFAULT)
     set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_512_x86_64>)
 endif()
 
 if(OQS_ENABLE_KEM_ml_kem_512_aarch64)
-    add_library(ml_kem_512_aarch64 OBJECT mlkem-native_ml-kem-512_aarch64/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-512_aarch64/cbd.c mlkem-native_ml-kem-512_aarch64/debug.c mlkem-native_ml-kem-512_aarch64/indcpa.c mlkem-native_ml-kem-512_aarch64/kem.c mlkem-native_ml-kem-512_aarch64/ntt.c mlkem-native_ml-kem-512_aarch64/poly.c mlkem-native_ml-kem-512_aarch64/polyvec.c mlkem-native_ml-kem-512_aarch64/rej_uniform.c mlkem-native_ml-kem-512_aarch64/verify.c mlkem-native_ml-kem-512_aarch64/zetas.c)
+    add_library(ml_kem_512_aarch64 OBJECT mlkem-native_ml-kem-512_aarch64/compress.c mlkem-native_ml-kem-512_aarch64/debug.c mlkem-native_ml-kem-512_aarch64/indcpa.c mlkem-native_ml-kem-512_aarch64/kem.c mlkem-native_ml-kem-512_aarch64/native/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_clean.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_clean.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_clean.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-512_aarch64/poly.c mlkem-native_ml-kem-512_aarch64/poly_k.c mlkem-native_ml-kem-512_aarch64/sampling.c mlkem-native_ml-kem-512_aarch64/verify.c mlkem-native_ml-kem-512_aarch64/zetas.c)
     target_include_directories(ml_kem_512_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-512_aarch64)
     target_include_directories(ml_kem_512_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
-    target_compile_options(ml_kem_512_aarch64 PUBLIC -DMLKEM_K=2 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT)
+    target_compile_options(ml_kem_512_aarch64 PUBLIC -DMLKEM_K=2 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE_BACKEND_ARITH -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT)
     set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_512_aarch64>)
 endif()
 
 if(OQS_ENABLE_KEM_ml_kem_768)
-    add_library(ml_kem_768_ref OBJECT kem_ml_kem_768.c mlkem-native_ml-kem-768_ref/cbd.c mlkem-native_ml-kem-768_ref/debug.c mlkem-native_ml-kem-768_ref/indcpa.c mlkem-native_ml-kem-768_ref/kem.c mlkem-native_ml-kem-768_ref/ntt.c mlkem-native_ml-kem-768_ref/poly.c mlkem-native_ml-kem-768_ref/polyvec.c mlkem-native_ml-kem-768_ref/rej_uniform.c mlkem-native_ml-kem-768_ref/verify.c mlkem-native_ml-kem-768_ref/zetas.c)
+    add_library(ml_kem_768_ref OBJECT kem_ml_kem_768.c mlkem-native_ml-kem-768_ref/compress.c mlkem-native_ml-kem-768_ref/debug.c mlkem-native_ml-kem-768_ref/indcpa.c mlkem-native_ml-kem-768_ref/kem.c mlkem-native_ml-kem-768_ref/poly.c mlkem-native_ml-kem-768_ref/poly_k.c mlkem-native_ml-kem-768_ref/sampling.c mlkem-native_ml-kem-768_ref/verify.c mlkem-native_ml-kem-768_ref/zetas.c)
     target_compile_options(ml_kem_768_ref PUBLIC -DMLKEM_K=3 -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM768_C)
     target_include_directories(ml_kem_768_ref PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-768_ref)
     target_include_directories(ml_kem_768_ref PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
@@ -41,24 +41,24 @@ if(OQS_ENABLE_KEM_ml_kem_768)
 endif()
 
 if(OQS_ENABLE_KEM_ml_kem_768_x86_64)
-    add_library(ml_kem_768_x86_64 OBJECT mlkem-native_ml-kem-768_x86_64/cbd.c mlkem-native_ml-kem-768_x86_64/debug.c mlkem-native_ml-kem-768_x86_64/indcpa.c mlkem-native_ml-kem-768_x86_64/kem.c mlkem-native_ml-kem-768_x86_64/ntt.c mlkem-native_ml-kem-768_x86_64/poly.c mlkem-native_ml-kem-768_x86_64/polyvec.c mlkem-native_ml-kem-768_x86_64/rej_uniform.c mlkem-native_ml-kem-768_x86_64/verify.c mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.c mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.S mlkem-native_ml-kem-768_x86_64/x86_64/src/consts.c mlkem-native_ml-kem-768_x86_64/x86_64/src/fq.S mlkem-native_ml-kem-768_x86_64/x86_64/src/intt.S mlkem-native_ml-kem-768_x86_64/x86_64/src/ntt.S mlkem-native_ml-kem-768_x86_64/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-768_x86_64/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-768_x86_64/x86_64/src/shuffle.S mlkem-native_ml-kem-768_x86_64/zetas.c)
+    add_library(ml_kem_768_x86_64 OBJECT mlkem-native_ml-kem-768_x86_64/compress.c mlkem-native_ml-kem-768_x86_64/debug.c mlkem-native_ml-kem-768_x86_64/indcpa.c mlkem-native_ml-kem-768_x86_64/kem.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/consts.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/fq.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/intt.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntt.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/shuffle.S mlkem-native_ml-kem-768_x86_64/poly.c mlkem-native_ml-kem-768_x86_64/poly_k.c mlkem-native_ml-kem-768_x86_64/sampling.c mlkem-native_ml-kem-768_x86_64/verify.c mlkem-native_ml-kem-768_x86_64/zetas.c)
     target_include_directories(ml_kem_768_x86_64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-768_x86_64)
     target_include_directories(ml_kem_768_x86_64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
     target_compile_options(ml_kem_768_x86_64 PRIVATE  -mavx2  -mbmi2  -mpopcnt )
-    target_compile_options(ml_kem_768_x86_64 PUBLIC -DMLKEM_K=3 -DFORCE_X86_64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=X86_64_DEFAULT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM768_X86_64_DEFAULT)
+    target_compile_options(ml_kem_768_x86_64 PUBLIC -DMLKEM_K=3 -DFORCE_X86_64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=X86_64_DEFAULT -DMLKEM_USE_NATIVE_BACKEND_ARITH -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM768_X86_64_DEFAULT)
     set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_768_x86_64>)
 endif()
 
 if(OQS_ENABLE_KEM_ml_kem_768_aarch64)
-    add_library(ml_kem_768_aarch64 OBJECT mlkem-native_ml-kem-768_aarch64/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-768_aarch64/cbd.c mlkem-native_ml-kem-768_aarch64/debug.c mlkem-native_ml-kem-768_aarch64/indcpa.c mlkem-native_ml-kem-768_aarch64/kem.c mlkem-native_ml-kem-768_aarch64/ntt.c mlkem-native_ml-kem-768_aarch64/poly.c mlkem-native_ml-kem-768_aarch64/polyvec.c mlkem-native_ml-kem-768_aarch64/rej_uniform.c mlkem-native_ml-kem-768_aarch64/verify.c mlkem-native_ml-kem-768_aarch64/zetas.c)
+    add_library(ml_kem_768_aarch64 OBJECT mlkem-native_ml-kem-768_aarch64/compress.c mlkem-native_ml-kem-768_aarch64/debug.c mlkem-native_ml-kem-768_aarch64/indcpa.c mlkem-native_ml-kem-768_aarch64/kem.c mlkem-native_ml-kem-768_aarch64/native/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_clean.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_clean.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_clean.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-768_aarch64/poly.c mlkem-native_ml-kem-768_aarch64/poly_k.c mlkem-native_ml-kem-768_aarch64/sampling.c mlkem-native_ml-kem-768_aarch64/verify.c mlkem-native_ml-kem-768_aarch64/zetas.c)
     target_include_directories(ml_kem_768_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-768_aarch64)
     target_include_directories(ml_kem_768_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
-    target_compile_options(ml_kem_768_aarch64 PUBLIC -DMLKEM_K=3 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_OPT)
+    target_compile_options(ml_kem_768_aarch64 PUBLIC -DMLKEM_K=3 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE_BACKEND_ARITH -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_OPT)
     set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_768_aarch64>)
 endif()
 
 if(OQS_ENABLE_KEM_ml_kem_1024)
-    add_library(ml_kem_1024_ref OBJECT kem_ml_kem_1024.c mlkem-native_ml-kem-1024_ref/cbd.c mlkem-native_ml-kem-1024_ref/debug.c mlkem-native_ml-kem-1024_ref/indcpa.c mlkem-native_ml-kem-1024_ref/kem.c mlkem-native_ml-kem-1024_ref/ntt.c mlkem-native_ml-kem-1024_ref/poly.c mlkem-native_ml-kem-1024_ref/polyvec.c mlkem-native_ml-kem-1024_ref/rej_uniform.c mlkem-native_ml-kem-1024_ref/verify.c mlkem-native_ml-kem-1024_ref/zetas.c)
+    add_library(ml_kem_1024_ref OBJECT kem_ml_kem_1024.c mlkem-native_ml-kem-1024_ref/compress.c mlkem-native_ml-kem-1024_ref/debug.c mlkem-native_ml-kem-1024_ref/indcpa.c mlkem-native_ml-kem-1024_ref/kem.c mlkem-native_ml-kem-1024_ref/poly.c mlkem-native_ml-kem-1024_ref/poly_k.c mlkem-native_ml-kem-1024_ref/sampling.c mlkem-native_ml-kem-1024_ref/verify.c mlkem-native_ml-kem-1024_ref/zetas.c)
     target_compile_options(ml_kem_1024_ref PUBLIC -DMLKEM_K=4 -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM1024_C)
     target_include_directories(ml_kem_1024_ref PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-1024_ref)
     target_include_directories(ml_kem_1024_ref PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
@@ -67,19 +67,19 @@ if(OQS_ENABLE_KEM_ml_kem_1024)
 endif()
 
 if(OQS_ENABLE_KEM_ml_kem_1024_x86_64)
-    add_library(ml_kem_1024_x86_64 OBJECT mlkem-native_ml-kem-1024_x86_64/cbd.c mlkem-native_ml-kem-1024_x86_64/debug.c mlkem-native_ml-kem-1024_x86_64/indcpa.c mlkem-native_ml-kem-1024_x86_64/kem.c mlkem-native_ml-kem-1024_x86_64/ntt.c mlkem-native_ml-kem-1024_x86_64/poly.c mlkem-native_ml-kem-1024_x86_64/polyvec.c mlkem-native_ml-kem-1024_x86_64/rej_uniform.c mlkem-native_ml-kem-1024_x86_64/verify.c mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.c mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.S mlkem-native_ml-kem-1024_x86_64/x86_64/src/consts.c mlkem-native_ml-kem-1024_x86_64/x86_64/src/fq.S mlkem-native_ml-kem-1024_x86_64/x86_64/src/intt.S mlkem-native_ml-kem-1024_x86_64/x86_64/src/ntt.S mlkem-native_ml-kem-1024_x86_64/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-1024_x86_64/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_x86_64/x86_64/src/shuffle.S mlkem-native_ml-kem-1024_x86_64/zetas.c)
+    add_library(ml_kem_1024_x86_64 OBJECT mlkem-native_ml-kem-1024_x86_64/compress.c mlkem-native_ml-kem-1024_x86_64/debug.c mlkem-native_ml-kem-1024_x86_64/indcpa.c mlkem-native_ml-kem-1024_x86_64/kem.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/consts.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/fq.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/intt.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntt.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/shuffle.S mlkem-native_ml-kem-1024_x86_64/poly.c mlkem-native_ml-kem-1024_x86_64/poly_k.c mlkem-native_ml-kem-1024_x86_64/sampling.c mlkem-native_ml-kem-1024_x86_64/verify.c mlkem-native_ml-kem-1024_x86_64/zetas.c)
     target_include_directories(ml_kem_1024_x86_64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-1024_x86_64)
     target_include_directories(ml_kem_1024_x86_64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
     target_compile_options(ml_kem_1024_x86_64 PRIVATE  -mavx2  -mbmi2  -mpopcnt )
-    target_compile_options(ml_kem_1024_x86_64 PUBLIC -DMLKEM_K=4 -DFORCE_X86_64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=X86_64_DEFAULT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_DEFAULT)
+    target_compile_options(ml_kem_1024_x86_64 PUBLIC -DMLKEM_K=4 -DFORCE_X86_64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=X86_64_DEFAULT -DMLKEM_USE_NATIVE_BACKEND_ARITH -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_DEFAULT)
     set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_1024_x86_64>)
 endif()
 
 if(OQS_ENABLE_KEM_ml_kem_1024_aarch64)
-    add_library(ml_kem_1024_aarch64 OBJECT mlkem-native_ml-kem-1024_aarch64/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_aarch64/cbd.c mlkem-native_ml-kem-1024_aarch64/debug.c mlkem-native_ml-kem-1024_aarch64/indcpa.c mlkem-native_ml-kem-1024_aarch64/kem.c mlkem-native_ml-kem-1024_aarch64/ntt.c mlkem-native_ml-kem-1024_aarch64/poly.c mlkem-native_ml-kem-1024_aarch64/polyvec.c mlkem-native_ml-kem-1024_aarch64/rej_uniform.c mlkem-native_ml-kem-1024_aarch64/verify.c mlkem-native_ml-kem-1024_aarch64/zetas.c)
+    add_library(ml_kem_1024_aarch64 OBJECT mlkem-native_ml-kem-1024_aarch64/compress.c mlkem-native_ml-kem-1024_aarch64/debug.c mlkem-native_ml-kem-1024_aarch64/indcpa.c mlkem-native_ml-kem-1024_aarch64/kem.c mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_clean.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_clean.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_clean.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_aarch64/poly.c mlkem-native_ml-kem-1024_aarch64/poly_k.c mlkem-native_ml-kem-1024_aarch64/sampling.c mlkem-native_ml-kem-1024_aarch64/verify.c mlkem-native_ml-kem-1024_aarch64/zetas.c)
     target_include_directories(ml_kem_1024_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-1024_aarch64)
     target_include_directories(ml_kem_1024_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
-    target_compile_options(ml_kem_1024_aarch64 PUBLIC -DMLKEM_K=4 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_OPT)
+    target_compile_options(ml_kem_1024_aarch64 PUBLIC -DMLKEM_K=4 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE_BACKEND_ARITH -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_OPT)
     set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_1024_aarch64>)
 endif()
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/arith_backend.h
index 0543b1bd1..ade31cda1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/arith_backend.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/arith_backend.h
@@ -17,7 +17,7 @@
  * Keep this _after_ the inclusion of the backend; otherwise,
  * the sanity checks won't have an effect. */
 #if defined(MLKEM_NATIVE_CHECK_APIS)
-#include "api.h"
+#include "native/api.h"
 #endif
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbd.c
deleted file mode 100644
index 1e6b7c5d1..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbd.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
-
-#include <stdint.h>
-#include "cbd.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
-#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
-/* End of static namespacing */
-
-/*************************************************
- * Name:        load32_littleendian
- *
- * Description: load 4 bytes into a 32-bit integer
- *              in little-endian order
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x
- **************************************************/
-static uint32_t load32_littleendian(const uint8_t x[4])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  r |= (uint32_t)x[3] << 24;
-  return r;
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
-  {
-    unsigned j;
-    uint32_t t = load32_littleendian(buf + 4 * i);
-    uint32_t d = t & 0x55555555;
-    d += (t >> 1) & 0x55555555;
-
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
-    {
-      const int16_t a = (d >> (4 * j + 0)) & 0x3;
-      const int16_t b = (d >> (4 * j + 2)) & 0x3;
-      r->coeffs[8 * i + j] = a - b;
-    }
-  }
-}
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
-/*************************************************
- * Name:        load24_littleendian
- *
- * Description: load 3 bytes into a 32-bit integer
- *              in little-endian order.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
- **************************************************/
-static uint32_t load24_littleendian(const uint8_t x[3])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  return r;
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 4; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 4)
-    invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
-  {
-    unsigned j;
-    const uint32_t t = load24_littleendian(buf + 3 * i);
-    uint32_t d = t & 0x00249249;
-    d += (t >> 1) & 0x00249249;
-    d += (t >> 2) & 0x00249249;
-
-    for (j = 0; j < 4; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 4 && j <= 4)
-      invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
-    {
-      const int16_t a = (d >> (6 * j + 0)) & 0x7;
-      const int16_t b = (d >> (6 * j + 3)) & 0x7;
-      r->coeffs[4 * i + j] = a - b;
-    }
-  }
-}
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
-          3 */
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd)
-int empty_cu_cbd;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbd.h
deleted file mode 100644
index 54c1f5b90..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/cbd.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef CBD_H
-#define CBD_H
-
-#include <stdint.h>
-#include "common.h"
-#include "poly.h"
-
-#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
-/*************************************************
- * Name:        poly_cbd2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
-#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
-/*************************************************
- * Name:        poly_cbd3
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=3.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */
-
-#endif /* CBD_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/common.h
index 4f326333e..62ed53ab1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/common.h
@@ -15,12 +15,19 @@
 #include "sys.h"
 
 /* Include backend metadata */
-#if defined(MLKEM_USE_NATIVE)
-#if defined(MLKEM_NATIVE_ARITH_BACKEND)
-#include MLKEM_NATIVE_ARITH_BACKEND
+#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH)
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_FILE)
+#include MLKEM_NATIVE_ARITH_BACKEND_FILE
+#else
+#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_ARITH is set, but MLKEM_NATIVE_ARITH_BACKEND_FILE is not.
+#endif
 #endif
-#if defined(MLKEM_NATIVE_FIPS202_BACKEND)
-#include MLKEM_NATIVE_FIPS202_BACKEND
+
+#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202)
+#if defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE)
+#include MLKEM_NATIVE_FIPS202_BACKEND_FILE
+#else
+#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, but MLKEM_NATIVE_FIPS202_BACKEND_FILE is not.
 #endif
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/compress.c
new file mode 100644
index 000000000..a03fe0ac4
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/compress.c
@@ -0,0 +1,395 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
+#include <stdint.h>
+#include <string.h>
+#include "arith_backend.h"
+#include "cbmc.h"
+#include "compress.h"
+#include "debug.h"
+#include "verify.h"
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 16)))
+    {
+      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
+    }
+
+    r[i * 4] = t[0] | (t[1] << 4);
+    r[i * 4 + 1] = t[2] | (t[3] << 4);
+    r[i * 4 + 2] = t[4] | (t[5] << 4);
+    r[i * 4 + 3] = t[6] | (t[7] << 4);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(invariant(j <= MLKEM_N / 4))
+  {
+    unsigned k;
+    uint16_t t[4];
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k <= 4)
+      invariant(forall(r, 0, k, t[r] < (1u << 10))))
+    {
+      t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 10-bit in size.
+     */
+    r[5 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
+    r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
+    r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
+    r[5 * j + 4] = (t[3] >> 2);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
+  {
+    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
+    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 4)
+    invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[4];
+    uint8_t const *base = &a[5 * j];
+
+    t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6));
+    t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4));
+    t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2));
+
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k <= 4)
+      invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 32)))
+    {
+      t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
+    }
+
+    /*
+     * Explicitly truncate to avoid warning about
+     * implicit truncation in CBMC, and use array indexing into
+     * r rather than pointer-arithmetic to simplify verification
+     */
+    r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
+    r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
+    r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
+    r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
+    r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(invariant(j <= MLKEM_N / 8))
+  {
+    unsigned k;
+    uint16_t t[8];
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+    {
+      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 11-bit in size.
+     */
+    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
+    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
+    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
+    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
+    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
+    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
+    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
+    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
+    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
+    r[11 * j + 10] = (t[7] >> 3);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
+  {
+    unsigned j;
+    uint8_t t[8];
+    const unsigned offset = i * 5;
+    /*
+     * Explicitly truncate to avoid warning about
+     * implicit truncation in CBMC and unwind loop for ease
+     * of proof.
+     */
+
+    /*
+     * Decompress 5 8-bit bytes (so 40 bits) into
+     * 8 5-bit values stored in t[]
+     */
+    t[0] = 0x1F & (a[offset + 0] >> 0);
+    t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3));
+    t[2] = 0x1F & (a[offset + 1] >> 2);
+    t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1));
+    t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4));
+    t[5] = 0x1F & (a[offset + 3] >> 1);
+    t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2));
+    t[7] = 0x1F & (a[offset + 4] >> 3);
+
+    /* and copy to the correct slice in r[] */
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(j <= 8 && i <= MLKEM_N / 8)
+      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[8];
+    uint8_t const *base = &a[11 * j];
+    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
+    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
+                    ((uint16_t)base[4] << 10));
+    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
+    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
+    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
+                    ((uint16_t)base[8] << 9));
+    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
+    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
+
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(invariant(i <= MLKEM_N / 2))
+  {
+    const uint16_t t0 = a->coeffs[2 * i];
+    const uint16_t t1 = a->coeffs[2 * i + 1];
+    /*
+     * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
+     * significant data, so these can be packed into 24 bits or exactly
+     * 3 bytes, as follows.
+     */
+
+    /* Least significant bits 0 - 7 of t0. */
+    r[3 * i + 0] = t0 & 0xFF;
+
+    /*
+     * Most significant bits 8 - 11 of t0 become the least significant
+     * nibble of the second byte. The least significant 4 bits
+     * of t1 become the upper nibble of the second byte.
+     */
+    r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+
+    /* Bits 4 - 11 of t1 become the third byte. */
+    r[3 * i + 2] = t1 >> 4;
+  }
+}
+#else  /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  poly_tobytes_native(r, a->coeffs);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
+  {
+    const uint8_t t0 = a[3 * i + 0];
+    const uint8_t t1 = a[3 * i + 1];
+    const uint8_t t2 = a[3 * i + 2];
+    r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
+    r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
+  }
+
+  /* Note that the coefficients are not canonical */
+  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
+}
+#else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+  poly_frombytes_native(r->coeffs, a);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+{
+  unsigned i;
+#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
+#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
+#endif
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
+  {
+    unsigned j;
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <  MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
+    {
+      /* Prevent the compiler from recognizing this as a bit selection */
+      uint8_t mask = value_barrier_u8(1u << j);
+      r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
+    }
+  }
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    msg[i] = 0;
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8))
+    {
+      uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
+      msg[i] |= t << j;
+    }
+  }
+}
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_compress MLKEM_NAMESPACE_K(empty_cu_compress)
+int empty_cu_compress;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/compress.h
new file mode 100644
index 000000000..409dbe519
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/compress.h
@@ -0,0 +1,495 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef COMPRESS_H
+#define COMPRESS_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "cbmc.h"
+#include "common.h"
+#include "debug.h"
+#include "poly.h"
+#include "verify.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1)
+#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4)
+#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5)
+#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10)
+#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11)
+#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4)
+#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5)
+#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10)
+#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11)
+/* End of static namespacing */
+
+/************************************************************
+ * Name: scalar_compress_d1
+ *
+ * Description: Computes round(u * 2 / q) % 2
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 1.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d1(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 2)
+  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
+{
+  uint32_t d0 = u << 1;
+  d0 *= 645083;
+  d0 += 1u << 30;
+  d0 >>= 31;
+  return d0;
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_compress_d4
+ *
+ * Description: Computes round(u * 16 / q) % 16
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d4(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 16)
+  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
+{
+  uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */
+  return (d0 + (1u << 27)) >> 28;      /* round(d0/2^28) */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d4
+ *
+ * Description: Computes round(u * q / 16)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 16
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d4(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 16)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 8) / 16; }
+
+/************************************************************
+ * Name: scalar_compress_d5
+ *
+ * Description: Computes round(u * 32 / q) % 32
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d5(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 32)
+  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
+{
+  uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */
+  return (d0 + (1u << 26)) >> 27;      /* round(d0/2^27) */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d5
+ *
+ * Description: Computes round(u * q / 32)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 32
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d5(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 32)
+  ensures(return_value <= MLKEM_Q - 1)
+) { return ((u * MLKEM_Q) + 16) / 32; }
+
+/************************************************************
+ * Name: scalar_compress_d10
+ *
+ * Description: Computes round(u * 2**10 / q) % 2**10
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The product in this routine exceeds UINT32_MAX for large values of u,
+ * but it is computed in uint64_t and therefore does not wrap around.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d10(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 10))
+  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
+{
+  uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^33 / MLKEM_Q) */
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33; /* round(d0 / 2^33) */
+  return (d0 & 0x3FF);                    /* reduce modulo 2^10 */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d10
+ *
+ * Description: Computes round(u * q / 1024)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 1024
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d10(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 1024)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 512) / 1024; }
+
+/************************************************************
+ * Name: scalar_compress_d11
+ *
+ * Description: Computes round(u * 2**11 / q) % 2**11
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine exceeds UINT32_MAX for large values
+ * of u. It is computed in 64 bits and therefore does not wrap around.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d11(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 11))
+  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
+{
+  uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33; /* round(d0 / 2^33) */
+  return (d0 & 0x7FF);                    /* reduce modulo 2^11 */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d11
+ *
+ * Description: Computes round(u * q / 2048)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 2048
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d11(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 2048)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 1024) / 2048; }
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
+    (MLKEM_K == 2 || MLKEM_K == 3)
+#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
+/*************************************************
+ * Name:        poly_compress_d4
+ *
+ * Description: Compression (4 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
+
+#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
+/*************************************************
+ * Name:        poly_compress_d10
+ *
+ * Description: Compression (10 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
+
+#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
+/*************************************************
+ * Name:        poly_decompress_d4
+ *
+ * Description: De-serialization and subsequent decompression (4 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d4
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
+
+#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
+/*************************************************
+ * Name:        poly_decompress_d10
+ *
+ * Description: De-serialization and subsequent decompression (10 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d10
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
+/*************************************************
+ * Name:        poly_compress_d5
+ *
+ * Description: Compression (5 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
+
+#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
+/*************************************************
+ * Name:        poly_compress_d11
+ *
+ * Description: Compression (11 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
+
+#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
+/*************************************************
+ * Name:        poly_decompress_d5
+ *
+ * Description: De-serialization and subsequent decompression (5 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d5
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+
+#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
+/*************************************************
+ * Name:        poly_decompress_d11
+ *
+ * Description: De-serialization and subsequent decompression (11 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d11
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
+        */
+
+#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
+/*************************************************
+ * Name:        poly_tobytes
+ *
+ * Description: Serialization of a polynomial.
+ *              Signed coefficients are converted to
+ *              unsigned form before serialization.
+ *
+ * Arguments:   INPUT:
+ *              - a: const pointer to input polynomial,
+ *                with each coefficient in the range [0,1,..,Q-1]
+ *              OUTPUT
+ *              - r: pointer to output byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYBYTES))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r))
+);
+
+
+#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes)
+/*************************************************
+ * Name:        poly_frombytes
+ *
+ * Description: De-serialization of a polynomial.
+ *
+ * Arguments:   INPUT
+ *              - a: pointer to input byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ *              OUTPUT
+ *              - r: pointer to output polynomial, with
+ *                   each coefficient unsigned and in the range
+ *                   0 .. 4095
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))
+);
+
+
+#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg)
+/*************************************************
+ * Name:        poly_frommsg
+ *
+ * Description: Convert 32-byte message to polynomial
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *msg: pointer to input message
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+);
+
+#define poly_tomsg MLKEM_NAMESPACE(poly_tomsg)
+/*************************************************
+ * Name:        poly_tomsg
+ *
+ * Description: Convert polynomial to 32-byte message
+ *
+ * Arguments:   - uint8_t *msg: pointer to output message
+ *              - const poly *r: pointer to input polynomial
+ *                Coefficients must be unsigned canonical
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r)
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(msg))
+);
+
+#endif /* COMPRESS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/config.h
index fa89370ce..e975ede95 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/config.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/config.h
@@ -122,46 +122,87 @@
 /* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 /******************************************************************************
- * Name:        MLKEM_USE_NATIVE
+ * Name:        MLKEM_USE_NATIVE_BACKEND_ARITH
  *
- * Description: Determines whether a native backend should
- *              be used, if available.
+ * Description: Determines whether a native arithmetic backend should be used.
+ *
+ *              The arithmetic backend covers performance critical functions
+ *              such as the number-theoretic transform (NTT).
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the arithmetic backend to be used is
+ *              determined by MLKEM_NATIVE_ARITH_BACKEND_FILE: If the latter
+ *              is unset, the default backend for the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
  *
  *              This can also be set using CFLAGS.
  *
  *****************************************************************************/
-#if !defined(MLKEM_USE_NATIVE)
-/* #define MLKEM_USE_NATIVE */
+#if !defined(MLKEM_USE_NATIVE_BACKEND_ARITH)
+/* #define MLKEM_USE_NATIVE_BACKEND_ARITH */
 #endif
 
 /******************************************************************************
- * Name:        MLKEM_NATIVE_ARITH_BACKEND
+ * Name:        MLKEM_NATIVE_ARITH_BACKEND_FILE
  *
  * Description: The arithmetic backend to use.
  *
- *              This must be the filename of an arithmetic backend.
- *              See the existing backends for examples.
+ *              If MLKEM_USE_NATIVE_BACKEND_ARITH is unset, this option
+ *              is ignored.
+ *
+ *              If MLKEM_USE_NATIVE_BACKEND_ARITH is set, this option must
+ *              either be undefined or the filename of an arithmetic backend.
+ *              If unset, the default backend will be used.
  *
  *              This can be set using CFLAGS.
  *
  *****************************************************************************/
-#if defined(MLKEM_USE_NATIVE) && !defined(MLKEM_NATIVE_ARITH_BACKEND)
-#define MLKEM_NATIVE_ARITH_BACKEND "default.h"
-#endif /* MLKEM_NATIVE_ARITH_BACKEND */
+#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) && \
+    !defined(MLKEM_NATIVE_ARITH_BACKEND_FILE)
+#define MLKEM_NATIVE_ARITH_BACKEND_FILE "native/default.h"
+#endif
 
 /******************************************************************************
- * Name:        MLKEM_NATIVE_FIPS202_BACKEND
+ * Name:        MLKEM_USE_NATIVE_BACKEND_FIPS202
+ *
+ * Description: Determines whether a native FIPS202 backend should be used.
+ *
+ *              The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is
+ *              the performance bottleneck of SHA3 and SHAKE.
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the FIPS202 backend to be used is
+ *              determined by MLKEM_NATIVE_FIPS202_BACKEND_FILE: If the latter
+ *              is unset, the default backend for the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if !defined(MLKEM_USE_NATIVE_BACKEND_FIPS202)
+/* #define MLKEM_USE_NATIVE_BACKEND_FIPS202 */
+#endif
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_FIPS202_BACKEND_FILE
  *
  * Description: The FIPS-202 backend to use.
  *
- *              This must be the filename of an FIPS-202 backend.
+ *              If MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, this option must
+ *              either be undefined or the filename of a FIPS202 backend.
+ *              If unset, the default backend will be used.
  *
  *              This can be set using CFLAGS.
  *
  *****************************************************************************/
-#if defined(MLKEM_USE_NATIVE_FIPS202) && !defined(MLKEM_NATIVE_FIPS202_BACKEND)
-#define MLKEM_NATIVE_FIPS202_BACKEND "native/default.h"
-#endif /* MLKEM_NATIVE_FIPS202_BACKEND */
+#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) && \
+    !defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE)
+#define MLKEM_NATIVE_FIPS202_BACKEND_FILE "fips202/native/default.h"
+#endif
 
 /*************************  Config internals  ********************************/
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.c
index 0cfcc3e9e..318d0fc77 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.c
@@ -9,11 +9,10 @@
 #include "fips202.h"
 #include "fips202x4.h"
 #include "indcpa.h"
-#include "ntt.h"
 #include "poly.h"
-#include "polyvec.h"
+#include "poly_k.h"
 #include "randombytes.h"
-#include "rej_uniform.h"
+#include "sampling.h"
 #include "symmetric.h"
 
 #include "arith_backend.h"
@@ -149,14 +148,14 @@ static void unpack_ciphertext(polyvec *b, poly *v,
 #define poly_permute_bitrev_to_custom \
   MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom)
 
-static INLINE void poly_permute_bitrev_to_custom(poly *data)
+static INLINE void poly_permute_bitrev_to_custom(int16_t data[MLKEM_N])
 __contract__(
   /* We don't specify that this should be a permutation, but only
    * that it does not change the bound established at the end of gen_matrix. */
-  requires(memory_no_alias(data, sizeof(poly)))
-  requires(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N))
+  requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))
   assigns(memory_slice(data, sizeof(poly)))
-  ensures(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
+  ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
 #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
 
 /* Not static for benchmarking */
@@ -245,7 +244,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
   {
     for (j = 0; j < MLKEM_K; j++)
     {
-      poly_permute_bitrev_to_custom(&a[i].vec[j]);
+      poly_permute_bitrev_to_custom(a[i].vec[j].coeffs);
     }
   }
 }
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.h
index 2c4fda3c4..b4d5985bf 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/indcpa.h
@@ -8,7 +8,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "polyvec.h"
+#include "poly_k.h"
 
 #define gen_matrix MLKEM_NAMESPACE_K(gen_matrix)
 /*************************************************
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/README.md b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/README.md
similarity index 100%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/README.md
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/README.md
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/clean.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/clean.h
similarity index 90%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/clean.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/clean.h
index 43a401dfc..f124702a4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/clean.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/clean.h
@@ -19,6 +19,6 @@
 /* Filename of the C backend implementation.
  * This is not inlined here because this header is included in assembly
  * files as well. */
-#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "aarch64/src/clean_impl.h"
+#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/aarch64/src/clean_impl.h"
 
 #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/opt.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/opt.h
similarity index 91%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/opt.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/opt.h
index 04323c3e7..a7217163f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/opt.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/opt.h
@@ -19,6 +19,6 @@
 /* Filename of the C backend implementation.
  * This is not inlined here because this header is included in assembly
  * files as well. */
-#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "aarch64/src/opt_impl.h"
+#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/aarch64/src/opt_impl.h"
 
 #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/aarch64_zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/aarch64_zetas.c
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/aarch64_zetas.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/aarch64_zetas.c
index 1e189fd99..b3a6f198f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/aarch64_zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/aarch64_zetas.c
@@ -8,7 +8,7 @@
  *          Do not modify it directly.
  */
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \
     defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/arith_native_aarch64.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/arith_native_aarch64.h
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/arith_native_aarch64.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/arith_native_aarch64.h
index fc4e7dd38..a784a3027 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/arith_native_aarch64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/arith_native_aarch64.h
@@ -6,7 +6,7 @@
 #define MLKEM_AARCH64_NATIVE_H
 
 #include <stdint.h>
-#include "common.h"
+#include "../../../common.h"
 
 #define aarch64_ntt_zetas_layer01234 \
   MLKEM_NAMESPACE(aarch64_ntt_zetas_layer01234)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/clean_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/clean_impl.h
similarity index 58%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/clean_impl.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/clean_impl.h
index 548b1eebb..ded7d067a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/clean_impl.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/clean_impl.h
@@ -12,9 +12,6 @@
 
 #include "arith_native_aarch64.h"
 
-#include "poly.h"
-#include "polyvec.h"
-
 /* Set of primitives that this backend replaces */
 #define MLKEM_USE_NATIVE_NTT
 #define MLKEM_USE_NATIVE_INTT
@@ -25,45 +22,46 @@
 #define MLKEM_USE_NATIVE_POLY_TOBYTES
 #define MLKEM_USE_NATIVE_REJ_UNIFORM
 
-static INLINE void ntt_native(poly *data)
+static INLINE void ntt_native(int16_t data[MLKEM_N])
 {
-  ntt_asm_clean(data->coeffs, aarch64_ntt_zetas_layer01234,
-                aarch64_ntt_zetas_layer56);
+  ntt_asm_clean(data, aarch64_ntt_zetas_layer01234, aarch64_ntt_zetas_layer56);
 }
 
-static INLINE void intt_native(poly *data)
+static INLINE void intt_native(int16_t data[MLKEM_N])
 {
-  intt_asm_clean(data->coeffs, aarch64_invntt_zetas_layer01234,
+  intt_asm_clean(data, aarch64_invntt_zetas_layer01234,
                  aarch64_invntt_zetas_layer56);
 }
 
-static INLINE void poly_reduce_native(poly *data)
+static INLINE void poly_reduce_native(int16_t data[MLKEM_N])
 {
-  poly_reduce_asm_clean(data->coeffs);
+  poly_reduce_asm_clean(data);
 }
-static INLINE void poly_tomont_native(poly *data)
+
+static INLINE void poly_tomont_native(int16_t data[MLKEM_N])
 {
-  poly_tomont_asm_clean(data->coeffs);
+  poly_tomont_asm_clean(data);
 }
 
-static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const poly *y)
+static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2],
+                                                const int16_t y[MLKEM_N])
 {
-  poly_mulcache_compute_asm_clean(x->coeffs, y->coeffs,
-                                  aarch64_zetas_mulcache_native,
+  poly_mulcache_compute_asm_clean(x, y, aarch64_zetas_mulcache_native,
                                   aarch64_zetas_mulcache_twisted_native);
 }
+
 static INLINE void polyvec_basemul_acc_montgomery_cached_native(
-    poly *r, const polyvec *a, const polyvec *b,
-    const polyvec_mulcache *b_cache)
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)])
 {
-  polyvec_basemul_acc_montgomery_cached_asm_clean(
-      r->coeffs, a->vec[0].coeffs, b->vec[0].coeffs, b_cache->vec[0].coeffs);
+  polyvec_basemul_acc_montgomery_cached_asm_clean(r, a, b, b_cache);
 }
 
 static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
-                                       const poly *a)
+                                       const int16_t a[MLKEM_N])
 {
-  poly_tobytes_asm_clean(r, a->coeffs);
+  poly_tobytes_asm_clean(r, a);
 }
 
 static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/consts.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/consts.h
similarity index 94%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/consts.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/consts.h
index c40947299..e3ea26a27 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/consts.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/consts.h
@@ -7,7 +7,7 @@
 #define MLKEM_NATIVE_AARCH64_CONSTS
 
 #include <stdint.h>
-#include "common.h"
+#include "../../../common.h"
 
 #define zetas_mulcache_native MLKEM_NAMESPACE(zetas_mulcache_native)
 extern const int16_t zetas_mulcache_native[256];
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_clean.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_clean.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_clean.S
index b243a569d..28ad38975 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_clean.S
@@ -23,7 +23,7 @@
 /// SOFTWARE.
 ///
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
 
 // Bounds:
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_opt.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_opt.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_opt.S
index c94746e17..857c729cb 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_opt.S
@@ -23,7 +23,7 @@
 /// SOFTWARE.
 ///
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
 // Bounds:
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_clean.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_clean.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_clean.S
index cd63cc4d6..30fdc76b0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_clean.S
@@ -24,7 +24,7 @@
 /// SOFTWARE.
 ///
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
 
 // Bounds:
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_opt.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_opt.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_opt.S
index 8705615b7..431f9dc6f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_opt.S
@@ -24,7 +24,7 @@
 /// SOFTWARE.
 ///
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
 // Bounds:
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/opt_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/opt_impl.h
similarity index 58%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/opt_impl.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/opt_impl.h
index ec1bf6587..eb8e39ed0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/opt_impl.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/opt_impl.h
@@ -10,11 +10,9 @@
 #else
 #define MLKEM_NATIVE_ARITH_PROFILE_IMPL_H
 
+#include "../../../params.h"
 #include "arith_native_aarch64.h"
 
-#include "poly.h"
-#include "polyvec.h"
-
 /* Set of primitives that this backend replaces */
 #define MLKEM_USE_NATIVE_NTT
 #define MLKEM_USE_NATIVE_INTT
@@ -25,45 +23,46 @@
 #define MLKEM_USE_NATIVE_POLY_TOBYTES
 #define MLKEM_USE_NATIVE_REJ_UNIFORM
 
-static INLINE void ntt_native(poly *data)
+static INLINE void ntt_native(int16_t data[MLKEM_N])
 {
-  ntt_asm_opt(data->coeffs, aarch64_ntt_zetas_layer01234,
-              aarch64_ntt_zetas_layer56);
+  ntt_asm_opt(data, aarch64_ntt_zetas_layer01234, aarch64_ntt_zetas_layer56);
 }
 
-static INLINE void intt_native(poly *data)
+static INLINE void intt_native(int16_t data[MLKEM_N])
 {
-  intt_asm_opt(data->coeffs, aarch64_invntt_zetas_layer01234,
+  intt_asm_opt(data, aarch64_invntt_zetas_layer01234,
                aarch64_invntt_zetas_layer56);
 }
 
-static INLINE void poly_reduce_native(poly *data)
+static INLINE void poly_reduce_native(int16_t data[MLKEM_N])
 {
-  poly_reduce_asm_opt(data->coeffs);
+  poly_reduce_asm_opt(data);
 }
-static INLINE void poly_tomont_native(poly *data)
+
+static INLINE void poly_tomont_native(int16_t data[MLKEM_N])
 {
-  poly_tomont_asm_opt(data->coeffs);
+  poly_tomont_asm_opt(data);
 }
 
-static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const poly *y)
+static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2],
+                                                const int16_t y[MLKEM_N])
 {
-  poly_mulcache_compute_asm_opt(x->coeffs, y->coeffs,
-                                aarch64_zetas_mulcache_native,
+  poly_mulcache_compute_asm_opt(x, y, aarch64_zetas_mulcache_native,
                                 aarch64_zetas_mulcache_twisted_native);
 }
+
 static INLINE void polyvec_basemul_acc_montgomery_cached_native(
-    poly *r, const polyvec *a, const polyvec *b,
-    const polyvec_mulcache *b_cache)
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)])
 {
-  polyvec_basemul_acc_montgomery_cached_asm_opt(
-      r->coeffs, a->vec[0].coeffs, b->vec[0].coeffs, b_cache->vec[0].coeffs);
+  polyvec_basemul_acc_montgomery_cached_asm_opt(r, a, b, b_cache);
 }
 
 static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
-                                       const poly *a)
+                                       const int16_t a[MLKEM_N])
 {
-  poly_tobytes_asm_opt(r, a->coeffs);
+  poly_tobytes_asm_opt(r, a);
 }
 
 static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/optimize.sh b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/optimize.sh
similarity index 100%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/optimize.sh
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/optimize.sh
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_clean.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_clean.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_clean.S
index 809f9667e..f3ee0796f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_clean.S
@@ -3,7 +3,7 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
 
 /*
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_opt.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_opt.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_opt.S
index 815a9dd1a..555c60a67 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_opt.S
@@ -3,7 +3,7 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
 /*
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_clean.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_clean.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_clean.S
index c91675b44..0b6df6345 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_clean.S
@@ -9,7 +9,7 @@
 // https://eprint.iacr.org/2021/986
 // https://github.com/neon-ntt/neon-ntt
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
 
 // Input:
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_opt.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_opt.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_opt.S
index 8300b682c..7a27fda3e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_opt.S
@@ -9,7 +9,7 @@
 // https://eprint.iacr.org/2021/986
 // https://github.com/neon-ntt/neon-ntt
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
 // Input:
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_asm_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_asm_clean.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_asm_clean.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_asm_clean.S
index 5151a05d0..9158d6c82 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_asm_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_asm_clean.S
@@ -18,7 +18,7 @@
  *
  * Returns number of sampled 16-bit integers (at most MLKEM_N).
  **************************************************/
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \
     defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_table.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_table.c
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_table.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_table.c
index 507660349..29cdbe95f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_table.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_table.c
@@ -8,7 +8,7 @@
  *          Do not modify it directly.
  */
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \
     defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/api.h
similarity index 90%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/api.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/api.h
index 792ecb8a4..0704f9dcd 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/api.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/api.h
@@ -23,8 +23,7 @@
 #define MLKEM_NATIVE_ARITH_NATIVE_API_H
 
 #include <stdint.h>
-#include "poly.h"
-#include "polyvec.h"
+#include "../common.h"
 
 /*
  * This is the C<->native interface allowing for the drop-in of
@@ -65,9 +64,9 @@
  *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
  *              for more information.
  *
- * Arguments:   - poly *p: pointer to in/output polynomial
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
  **************************************************/
-static INLINE void ntt_native(poly *);
+static INLINE void ntt_native(int16_t p[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_NTT */
 
 #if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
@@ -96,10 +95,10 @@ and to/from bytes conversions."
  *
  *              This must only be defined if there is native code for
  *              all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache.
- * Arguments:   - poly *p: pointer to in/output polynomial
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
  *
  **************************************************/
-static INLINE void poly_permute_bitrev_to_custom(poly *);
+static INLINE void poly_permute_bitrev_to_custom(int16_t p[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
 
 #if defined(MLKEM_USE_NATIVE_INTT)
@@ -117,7 +116,7 @@ static INLINE void poly_permute_bitrev_to_custom(poly *);
  *
  * Arguments:   - uint16_t *a: pointer to in/output polynomial
  **************************************************/
-static INLINE void intt_native(poly *);
+static INLINE void intt_native(int16_t p[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_INTT */
 
 #if defined(MLKEM_USE_NATIVE_POLY_REDUCE)
@@ -126,9 +125,9 @@ static INLINE void intt_native(poly *);
  *
  * Description: Applies modular reduction to all coefficients of a polynomial.
  *
- * Arguments:   - poly *r: pointer to input/output polynomial
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to input/output polynomial
  **************************************************/
-static INLINE void poly_reduce_native(poly *);
+static INLINE void poly_reduce_native(int16_t p[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
 
 #if defined(MLKEM_USE_NATIVE_POLY_TOMONT)
@@ -138,9 +137,9 @@ static INLINE void poly_reduce_native(poly *);
  * Description: Inplace conversion of all coefficients of a polynomial
  *              from normal domain to Montgomery domain
  *
- * Arguments:   - poly *r: pointer to input/output polynomial
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to input/output polynomial
  **************************************************/
-static INLINE void poly_tomont_native(poly *);
+static INLINE void poly_tomont_native(int16_t p[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
 
 #if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE)
@@ -165,8 +164,8 @@ static INLINE void poly_tomont_native(poly *);
  *              OUTPUT
  *              - cache: pointer to multiplication cache
  **************************************************/
-static INLINE void poly_mulcache_compute_native(poly_mulcache *cache,
-                                                const poly *poly);
+static INLINE void poly_mulcache_compute_native(int16_t cache[MLKEM_N / 2],
+                                                const int16_t poly[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 
 #if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
@@ -189,8 +188,9 @@ static INLINE void poly_mulcache_compute_native(poly_mulcache *cache,
  *                   in NTT domain, and of the same order as a and b.
  **************************************************/
 static INLINE void polyvec_basemul_acc_montgomery_cached_native(
-    poly *r, const polyvec *a, const polyvec *b,
-    const polyvec_mulcache *b_cache);
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]);
 #endif
 
 #if defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
@@ -209,7 +209,7 @@ static INLINE void polyvec_basemul_acc_montgomery_cached_native(
  *                   (of MLKEM_POLYBYTES bytes)
  **************************************************/
 static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
-                                       const poly *a);
+                                       const int16_t a[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
 
 #if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
@@ -226,7 +226,7 @@ static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
  *              - a: const pointer to input byte aray
  *                   (of MLKEM_POLYBYTES bytes)
  **************************************************/
-static INLINE void poly_frombytes_native(poly *a,
+static INLINE void poly_frombytes_native(int16_t a[MLKEM_N],
                                          const uint8_t r[MLKEM_POLYBYTES]);
 #endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/default.h
similarity index 97%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/default.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/default.h
index d1e41c52e..f9fe4310a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/default.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/default.h
@@ -8,7 +8,7 @@
 /*
  * Default arithmetic backend
  */
-#include "sys.h"
+#include "../sys.h"
 
 #ifdef SYS_AARCH64
 /*
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/ntt.c
deleted file mode 100644
index 3651c8da9..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/ntt.c
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-
-#include <stdint.h>
-#include "arith_backend.h"
-#include "debug.h"
-#include "ntt.h"
-#include "reduce.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block)
-#define ntt_layer MLKEM_NAMESPACE(ntt_layer)
-#define invntt_layer MLKEM_NAMESPACE(invntt_layer)
-/* End of static namespacing */
-
-#if !defined(MLKEM_USE_NATIVE_NTT)
-/*
- * Computes a block CT butterflies with a fixed twiddle factor,
- * using Montgomery multiplication.
- * Parameters:
- * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
- * - root: Twiddle factor to use for the butterfly. This must be in
- *         Montgomery form and signed canonical.
- * - start: Offset to the beginning of the butterfly block
- * - len: Index difference between coefficients subject to a butterfly
- * - bound: Ghost variable describing coefficient bound: Prior to `start`,
- *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
- *          they must be bound by `bound`.
- * When this function returns, output coefficients in the index range
- * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
- * Example:
- * - start=8, len=4
- *   This would compute the following four butterflies
- *          8     --    12
- *             9    --     13
- *                10   --     14
- *                   11   --     15
- * - start=4, len=2
- *   This would compute the following two butterflies
- *          4 -- 6
- *             5 -- 7
- */
-static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
-                                unsigned start, unsigned len, int bound)
-__contract__(
-  requires(start < MLKEM_N)
-  requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
-  requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
-  requires(-HALF_Q < zeta && zeta < HALF_Q)
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(array_abs_bound(r, 0, start, bound + MLKEM_Q))
-  requires(array_abs_bound(r, start, MLKEM_N, bound))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q))
-  ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
-{
-  /* `bound` is a ghost variable only needed in the CBMC specification */
-  unsigned j;
-  ((void)bound);
-  for (j = start; j < start + len; j++)
-  __loop__(
-    invariant(start <= j && j <= start + len)
-    /*
-     * Coefficients are updated in strided pairs, so the bounds for the
-     * intermediate states alternate twice between the old and new bound
-     */
-    invariant(array_abs_bound(r, 0,           j,           bound + MLKEM_Q))
-    invariant(array_abs_bound(r, j,           start + len, bound))
-    invariant(array_abs_bound(r, start + len, j + len,     bound + MLKEM_Q))
-    invariant(array_abs_bound(r, j + len,     MLKEM_N,     bound)))
-  {
-    int16_t t;
-    t = fqmul(r[j + len], zeta);
-    r[j + len] = r[j] - t;
-    r[j] = r[j] + t;
-  }
-}
-
-/*
- *Compute one layer of forward NTT
- * Parameters:
- * - r: Pointer to base of polynomial
- * - len: Stride of butterflies in this layer.
- * - layer: Ghost variable indicating which layer is being applied.
- *          Must match `len` via `len == MLKEM_N >> layer`.
- * Note: `len` could be dropped and computed in the function, but
- *   we are following the structure of the reference NTT from the
- *   official Kyber implementation here, merely adding `layer` as
- *   a ghost variable for the specifications.
- */
-static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
-__contract__(
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
-  requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
-{
-  unsigned start, k;
-  /* `layer` is a ghost variable only needed in the CBMC specification */
-  ((void)layer);
-  /* Twiddle factors for layer n start at index 2^(layer-1) */
-  k = MLKEM_N / (2 * len);
-  for (start = 0; start < MLKEM_N; start += 2 * len)
-  __loop__(
-    invariant(start < MLKEM_N + 2 * len)
-    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
-    invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
-    invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
-  {
-    int16_t zeta = zetas[k++];
-    ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
-  }
-}
-
-/*
- * Compute full forward NTT
- * NOTE: This particular implementation satisfies a much tighter
- * bound on the output coefficients (5*q) than the contractual one (8*q),
- * but this is not needed in the calling code. Should we change the
- * base multiplication strategy to require smaller NTT output bounds,
- * the proof may need strengthening.
- */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *p)
-{
-  unsigned len, layer;
-  int16_t *r;
-  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
-  r = p->coeffs;
-
-  for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
-  __loop__(
-    invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer))
-    invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)))
-  {
-    ntt_layer(r, len, layer);
-  }
-
-  /* Check the stronger bound */
-  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
-}
-#else  /* MLKEM_USE_NATIVE_NTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *p)
-{
-  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
-  ntt_native(p);
-  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
-}
-#endif /* MLKEM_USE_NATIVE_NTT */
-
-#if !defined(MLKEM_USE_NATIVE_INTT)
-
-/* Compute one layer of inverse NTT */
-static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
-__contract__(
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
-  requires(len == (1 << (8 - layer)))
-  requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-{
-  unsigned start, k;
-  /* `layer` is a ghost variable used only in the specification */
-  ((void)layer);
-  k = MLKEM_N / len - 1;
-  for (start = 0; start < MLKEM_N; start += 2 * len)
-  __loop__(
-    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-    invariant(start <= MLKEM_N && k <= 127)
-    /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
-    invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
-  {
-    unsigned j;
-    int16_t zeta = zetas[k--];
-    for (j = start; j < start + len; j++)
-    __loop__(
-      invariant(start <= j && j <= start + len)
-      invariant(start <= MLKEM_N && k <= 127)
-      invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-    {
-      int16_t t = r[j];
-      r[j] = barrett_reduce(t + r[j + len]);
-      r[j + len] = r[j + len] - t;
-      r[j + len] = fqmul(r[j + len], zeta);
-    }
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *p)
-{
-  /*
-   * Scale input polynomial to account for Montgomery factor
-   * and NTT twist. This also brings coefficients down to
-   * absolute value < MLKEM_Q.
-   */
-  unsigned j, len, layer;
-  const int16_t f = 1441;
-  int16_t *r = p->coeffs;
-
-  for (j = 0; j < MLKEM_N; j++)
-  __loop__(
-    invariant(j <= MLKEM_N)
-    invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
-  {
-    r[j] = fqmul(r[j], f);
-  }
-
-  /* Run the invNTT layers */
-  for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
-  __loop__(
-    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
-    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-  {
-    invntt_layer(p->coeffs, len, layer);
-  }
-
-  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
-}
-#else  /* MLKEM_USE_NATIVE_INTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *p)
-{
-  intt_native(p);
-  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
-}
-#endif /* MLKEM_USE_NATIVE_INTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
-                    int16_t b_cached)
-{
-  int32_t t0, t1;
-  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
-
-  t0 = (int32_t)a[1] * b_cached;
-  t0 += (int32_t)a[0] * b[0];
-  t1 = (int32_t)a[0] * b[1];
-  t1 += (int32_t)a[1] * b[0];
-
-  /* |ti| < 2 * q * 2^15 */
-  r[0] = montgomery_reduce(t0);
-  r[1] = montgomery_reduce(t1);
-
-  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
-}
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt)
-int empty_cu_ntt;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/ntt.h
deleted file mode 100644
index 4e80d3ab3..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/ntt.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef NTT_H
-#define NTT_H
-#include "common.h"
-
-#include <stdint.h>
-#include "cbmc.h"
-#include "poly.h"
-#include "reduce.h"
-
-#define zetas MLKEM_NAMESPACE(zetas)
-extern const int16_t zetas[128];
-
-#define poly_ntt MLKEM_NAMESPACE(poly_ntt)
-/*************************************************
- * Name:        poly_ntt
- *
- * Description: Computes negacyclic number-theoretic transform (NTT) of
- *              a polynomial in place.
- *
- *              The input is assumed to be in normal order and
- *              coefficient-wise bound by MLKEM_Q in absolute value.
- *
- *              The output polynomial is in bitreversed order, and
- *              coefficient-wise bound by NTT_BOUND in absolute value.
- *
- *              (NOTE: Sometimes the input to the NTT is actually smaller,
- *               which gives better bounds.)
- *
- * Arguments:   - poly *p: pointer to in/output polynomial
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND))
-);
-
-#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont)
-/*************************************************
- * Name:        poly_invntt_tomont
- *
- * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
- *              of a polynomial in place;
- *              inputs assumed to be in bitreversed order, output in normal
- *              order
- *
- *              The input is assumed to be in bitreversed order, and can
- *              have arbitrary coefficients in int16_t.
- *
- *              The output polynomial is in normal order, and
- *              coefficient-wise bound by INVNTT_BOUND in absolute value.
- *
- * Arguments:   - uint16_t *a: pointer to in/output polynomial
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND))
-);
-
-#define basemul_cached MLKEM_NAMESPACE(basemul_cached)
-/************************************************************
- * Name: basemul_cached
- *
- * Description: Computes a representative modulo q of
- *              (a0*b0 + a1*b_cached, a0*b1 + a1*b0)/65536
- *
- *              If b_cached is b1*zeta, this represents the
- *              product of (a0 + a1*X) and (b0 + b1*X) in
- *              Fq[X]/(X^2 - zeta).
- *
- * Arguments: - r: Pointer to output polynomial
- *                   Upon return, coefficients are bound by
- *                   2*MLKEM_Q in absolute value.
- *            - a: Pointer to first input polynomial
- *                   Every coefficient must be in [0..4095]
- *            - b: Pointer to second input polynomial
- *                   Can have arbitrary int16_t coefficients
- *            - b_cached: Some precomputed value, typically derived from
- *                   b1 and a twiddle factor. Can be an arbitary int16_t.
- ************************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
-                    int16_t b_cached)
-__contract__(
-  requires(memory_no_alias(r, 2 * sizeof(int16_t)))
-  requires(memory_no_alias(a, 2 * sizeof(int16_t)))
-  requires(memory_no_alias(b, 2 * sizeof(int16_t)))
-  requires(array_bound(a, 0, 2, 0, UINT12_LIMIT))
-  assigns(memory_slice(r, 2 * sizeof(int16_t)))
-  ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))
-);
-
-#endif /* NTT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/params.h
index 57ea4c8ba..7f6c12625 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/params.h
@@ -18,6 +18,7 @@
 #define MLKEM_N 256
 #define MLKEM_Q 3329
 #define UINT12_LIMIT 4096
+#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */
 
 #define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */
 #define MLKEM_SSBYTES 32  /* size in bytes of shared key */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.c
index 7483ebf6d..e8a2e2c6e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.c
@@ -8,388 +8,246 @@
 #include <stdint.h>
 #include <string.h>
 #include "arith_backend.h"
-#include "cbd.h"
 #include "cbmc.h"
 #include "debug.h"
 #include "fips202x4.h"
-#include "ntt.h"
 #include "poly.h"
-#include "reduce.h"
+#include "sampling.h"
 #include "symmetric.h"
 #include "verify.h"
 
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
-{
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 16)))
-    {
-      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
-    }
-
-    r[i * 4] = t[0] | (t[1] << 4);
-    r[i * 4 + 1] = t[2] | (t[3] << 4);
-    r[i * 4 + 2] = t[4] | (t[5] << 4);
-    r[i * 4 + 3] = t[6] | (t[7] << 4);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
-{
-  unsigned j;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-  for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(invariant(j <= MLKEM_N / 4))
-  {
-    unsigned k;
-    uint16_t t[4];
-    for (k = 0; k < 4; k++)
-    __loop__(
-      invariant(k <= 4)
-      invariant(forall(r, 0, k, t[r] < (1u << 10))))
-    {
-      t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
-    }
-
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 10-bit in size.
-     */
-    r[5 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
-    r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
-    r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
-    r[5 * j + 4] = (t[3] >> 2);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
-  {
-    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
-    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d10(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16)
+#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic)
+#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce)
+#define fqmul MLKEM_NAMESPACE(fqmul)
+#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce)
+#define basemul_cached MLKEM_NAMESPACE(basemul_cached)
+#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q)
+#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block)
+#define ntt_layer MLKEM_NAMESPACE(ntt_layer)
+#define invntt_layer MLKEM_NAMESPACE(invntt_layer)
+/* End of static namespacing */
+
+/*************************************************
+ * Name:        cast_uint16_to_int16
+ *
+ * Description: Cast uint16 value to int16
+ *
+ * Returns:
+ *   input x in     0 .. 32767: returns value unchanged
+ *   input x in 32768 .. 65535: returns (x - 65536)
+ **************************************************/
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "conversion"
+#endif
+ALWAYS_INLINE
+static INLINE int16_t cast_uint16_to_int16(uint16_t x)
 {
-  unsigned j;
-  for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(
-    invariant(j <= MLKEM_N / 4)
-    invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
-  {
-    unsigned k;
-    uint16_t t[4];
-    uint8_t const *base = &a[5 * j];
-
-    t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6));
-    t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4));
-    t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2));
-
-    for (k = 0; k < 4; k++)
-    __loop__(
-      invariant(k <= 4)
-      invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
-    }
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  /*
+   * PORTABILITY: This relies on uint16_t -> int16_t
+   * being implemented as the inverse of int16_t -> uint16_t,
+   * which is implementation-defined (C99 6.3.1.3 (3))
+   * CBMC (correctly) fails to prove this conversion is OK,
+   * so we have to suppress that check here
+   */
+  return (int16_t)x;
 }
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
-          || MLKEM_K == 3) */
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
 
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
+/*************************************************
+ * Name:        montgomery_reduce_generic
+ *
+ * Description: Generic Montgomery reduction; given a 32-bit integer a, computes
+ *              16-bit integer congruent to a * R^-1 mod q, where R=2^16
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q, with absolute value
+ *                <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2
+ *
+ **************************************************/
+ALWAYS_INLINE
+static INLINE int16_t montgomery_reduce_generic(int32_t a)
 {
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */
+  const uint32_t QINV = 62209; /* q^-1 mod 2^16 */
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 32)))
-    {
-      t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
-    }
+  /*  Compute a*q^{-1} mod 2^16 in unsigned representatives */
+  const uint16_t a_reduced = a & UINT16_MAX;
+  const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
 
-    /*
-     * Explicitly truncate to avoid warning about
-     * implicit truncation in CBMC, and use array indexing into
-     * r rather than pointer-arithmetic to simplify verification
-     */
-    r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
-    r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
-    r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
-    r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
-    r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
-  }
-}
+  /* Lift to signed canonical representative mod 2^16. */
+  const int16_t t = cast_uint16_to_int16(a_inverted);
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
-{
-  unsigned j;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  int32_t r = a - ((int32_t)t * MLKEM_Q);
+  /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */
 
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(invariant(j <= MLKEM_N / 8))
-  {
-    unsigned k;
-    uint16_t t[8];
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(k <= 8)
-      invariant(forall(r, 0, k, t[r] < (1u << 11))))
-    {
-      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
-    }
+  /*
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  r = r >> 16;
+  /* Bounds: |r >> 16| <= ceil(|r| / 2^16)
+   *                   <= ceil(|a| / 2^16 + MLKEM_Q / 2)
+   *                   <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
+   *
+   * (Note that |a >> 16| = ceil(|a| / 2^16) for negative a)
+   */
 
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 11-bit in size.
-     */
-    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
-    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
-    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
-    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
-    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
-    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
-    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
-    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
-    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
-    r[11 * j + 10] = (t[7] >> 3);
-  }
+  return (int16_t)r;
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+/*************************************************
+ * Name:        montgomery_reduce
+ *
+ * Description: Montgomery reduction
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *                  Must be smaller than 2 * 2^12 * 2^15 in absolute value.
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q,
+ *              smaller than 2 * q in absolute value.
+ **************************************************/
+static INLINE int16_t montgomery_reduce(int32_t a)
+__contract__(
+  requires(a > -(2 * UINT12_LIMIT * 32768))
+  requires(a <  (2 * UINT12_LIMIT * 32768))
+  ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
+)
 {
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
-  {
-    unsigned j;
-    uint8_t t[8];
-    const unsigned offset = i * 5;
-    /*
-     * Explicitly truncate to avoid warning about
-     * implicit truncation in CBMC and unwind loop for ease
-     * of proof.
-     */
-
-    /*
-     * Decompress 5 8-bit bytes (so 40 bits) into
-     * 8 5-bit values stored in t[]
-     */
-    t[0] = 0x1F & (a[offset + 0] >> 0);
-    t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3));
-    t[2] = 0x1F & (a[offset + 1] >> 2);
-    t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1));
-    t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4));
-    t[5] = 0x1F & (a[offset + 3] >> 1);
-    t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2));
-    t[7] = 0x1F & (a[offset + 4] >> 3);
-
-    /* and copy to the correct slice in r[] */
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(j <= 8 && i <= MLKEM_N / 8)
-      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
-    }
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  int16_t res;
+  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
+
+  res = montgomery_reduce_generic(a);
+  /* Bounds:
+   * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2
+   *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
+   *        < 2 * MLKEM_Q */
+
+  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
+  return res;
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d11(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+#if !defined(MLKEM_USE_NATIVE_POLY_TOMONT) ||           \
+    !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
+    !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        fqmul
+ *
+ * Description: Montgomery multiplication modulo q=3329
+ *
+ * Arguments:   - int16_t a: first factor
+ *                  Can be any int16_t.
+ *              - int16_t b: second factor.
+ *                  Must be signed canonical (abs value <(q+1)/2)
+ *
+ * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and
+ * smaller than q in absolute value.
+ *
+ **************************************************/
+static INLINE int16_t fqmul(int16_t a, int16_t b)
+__contract__(
+  requires(b > -HALF_Q)
+  requires(b < HALF_Q)
+  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
+)
 {
-  unsigned j;
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(
-    invariant(j <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
-  {
-    unsigned k;
-    uint16_t t[8];
-    uint8_t const *base = &a[11 * j];
-    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
-    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
-                    ((uint16_t)base[4] << 10));
-    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
-    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
-    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
-                    ((uint16_t)base[8] << 9));
-    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
-    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
-
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(k <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
-    }
-  }
+  int16_t res;
+  debug_assert_abs_bound(&b, 1, HALF_Q);
+
+  res = montgomery_reduce((int32_t)a * (int32_t)b);
+  /* Bounds:
+   * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
+   *        < MLKEM_Q
+   */
 
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  debug_assert_abs_bound(&res, 1, MLKEM_Q);
+  return res;
 }
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */
-
-#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+#endif /* !defined(MLKEM_USE_NATIVE_POLY_TOMONT) ||           \
+          !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
+          !defined(MLKEM_USE_NATIVE_NTT) ||                   \
+          !defined(MLKEM_USE_NATIVE_INTT) */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || !defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        barrett_reduce
+ *
+ * Description: Barrett reduction; given a 16-bit integer a, computes
+ *              centered representative congruent to a mod q in
+ *              {-(q-1)/2,...,(q-1)/2}
+ *
+ * Arguments:   - int16_t a: input integer to be reduced
+ *
+ * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
+ **************************************************/
+static INLINE int16_t barrett_reduce(int16_t a)
+__contract__(
+  ensures(return_value > -HALF_Q && return_value < HALF_Q)
+)
 {
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(invariant(i <= MLKEM_N / 2))
-  {
-    const uint16_t t0 = a->coeffs[2 * i];
-    const uint16_t t1 = a->coeffs[2 * i + 1];
-    /*
-     * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
-     * significant data, so these can be packed into 24 bits or exactly
-     * 3 bytes, as follows.
-     */
-
-    /* Least significant bits 0 - 7 of t0. */
-    r[3 * i + 0] = t0 & 0xFF;
-
-    /*
-     * Most significant bits 8 - 11 of t0 become the least significant
-     * nibble of the second byte. The least significant 4 bits
-     * of t1 become the upper nibble of the second byte.
-     */
-    r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+  /*
+   * To divide by MLKEM_Q using Barrett multiplication, the "magic number"
+   * multiplier is round_to_nearest(2**26/MLKEM_Q)
+   */
+  const int BPOWER = 26;
+  const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q;
 
-    /* Bits 4 - 11 of t1 become the third byte. */
-    r[3 * i + 2] = t1 >> 4;
-  }
-}
-#else  /* MLKEM_USE_NATIVE_POLY_TOBYTES */
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
-{
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-  poly_tobytes_native(r, a);
-}
-#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+  /*
+   * Compute round_to_nearest(a/MLKEM_Q) using the multiplier
+   * above and shift by BPOWER places.
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER;
 
-#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
-  {
-    const uint8_t t0 = a[3 * i + 0];
-    const uint8_t t1 = a[3 * i + 1];
-    const uint8_t t2 = a[3 * i + 2];
-    r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
-    r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
-  }
+  /*
+   * t is in -10 .. +10, so we need 32-bit math to
+   * evaluate t * MLKEM_Q and the subsequent subtraction
+   */
+  int16_t res = (int16_t)(a - t * MLKEM_Q);
 
-  /* Note that the coefficients are not canonical */
-  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
-}
-#else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-{
-  poly_frombytes_native(r, a);
+  debug_assert_abs_bound(&res, 1, HALF_Q);
+  return res;
 }
-#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+#endif /* !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || \
+          !defined(MLKEM_USE_NATIVE_INTT) */
+
+static void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
+                           int16_t b_cached)
+__contract__(
+  requires(memory_no_alias(r, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(a, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(b, 2 * sizeof(int16_t)))
+  requires(array_bound(a, 0, 2, 0, UINT12_LIMIT))
+  assigns(memory_slice(r, 2 * sizeof(int16_t)))
+  ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q)))
 {
-  unsigned i;
-#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
-#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
-#endif
+  int32_t t0, t1;
+  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
-  {
-    unsigned j;
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <  MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
-    {
-      /* Prevent the compiler from recognizing this as a bit selection */
-      uint8_t mask = value_barrier_u8(1u << j);
-      r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
-    }
-  }
-  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
-}
+  t0 = (int32_t)a[1] * b_cached;
+  t0 += (int32_t)a[0] * b[0];
+  t1 = (int32_t)a[0] * b[1];
+  t1 += (int32_t)a[1] * b[0];
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
-{
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  /* |ti| < 2 * UINT12_LIMIT * 2^15 */
+  r[0] = montgomery_reduce(t0);
+  r[1] = montgomery_reduce(t1);
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    msg[i] = 0;
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8))
-    {
-      uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
-      msg[i] |= t << j;
-    }
-  }
+  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -434,12 +292,46 @@ void poly_tomont(poly *r)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomont(poly *r)
 {
-  poly_tomont_native(r);
+  poly_tomont_native(r->coeffs);
   debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
 
 #if !defined(MLKEM_USE_NATIVE_POLY_REDUCE)
+/************************************************************
+ * Name: scalar_signed_to_unsigned_q
+ *
+ * Description: converts signed polynomial coefficient
+ *              from signed (-3328 .. 3328) form to
+ *              unsigned form (0 .. 3328).
+ *
+ * Note: Cryptographic constant time implementation
+ *
+ * Examples:       0 -> 0
+ *                 1 -> 1
+ *              3328 -> 3328
+ *                -1 -> 3328
+ *                -2 -> 3327
+ *             -3328 -> 1
+ *
+ * Arguments: c: signed coefficient to be converted
+ ************************************************************/
+static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
+__contract__(
+  requires(c > -MLKEM_Q && c < MLKEM_Q)
+  ensures(return_value >= 0 && return_value < MLKEM_Q)
+  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
+{
+  debug_assert_abs_bound(&c, 1, MLKEM_Q);
+
+  /* Add Q if c is negative, but in constant time */
+  c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
+
+  /* and therefore cast to uint16_t is safe. */
+  debug_assert_bound(&c, 1, 0, MLKEM_Q);
+  return (uint16_t)c;
+}
+
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
@@ -461,7 +353,7 @@ void poly_reduce(poly *r)
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
-  poly_reduce_native(r);
+  poly_reduce_native(r->coeffs);
   debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
@@ -520,13 +412,232 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 MLKEM_NATIVE_INTERNAL_API
 void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
-  poly_mulcache_compute_native(x, a);
+  poly_mulcache_compute_native(x->coeffs, a->coeffs);
   /* Omitting bounds assertion since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
 }
 #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 
+#if !defined(MLKEM_USE_NATIVE_NTT)
+/*
+ * Computes a block of CT butterflies with a fixed twiddle factor,
+ * using Montgomery multiplication.
+ * Parameters:
+ * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
+ * - zeta: Twiddle factor to use for the butterfly. This must be in
+ *         Montgomery form and signed canonical.
+ * - start: Offset to the beginning of the butterfly block
+ * - len: Index difference between coefficients subject to a butterfly
+ * - bound: Ghost variable describing coefficient bound: Prior to `start`,
+ *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
+ *          they must be bound by `bound`.
+ * When this function returns, output coefficients in the index range
+ * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
+ * Example:
+ * - start=8, len=4
+ *   This would compute the following four butterflies
+ *          8     --    12
+ *             9    --     13
+ *                10   --     14
+ *                   11   --     15
+ * - start=4, len=2
+ *   This would compute the following two butterflies
+ *          4 -- 6
+ *             5 -- 7
+ */
+static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
+                                unsigned start, unsigned len, int bound)
+__contract__(
+  requires(start < MLKEM_N)
+  requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
+  requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
+  requires(-HALF_Q < zeta && zeta < HALF_Q)
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(array_abs_bound(r, 0, start, bound + MLKEM_Q))
+  requires(array_abs_bound(r, start, MLKEM_N, bound))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q))
+  ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
+{
+  /* `bound` is a ghost variable only needed in the CBMC specification */
+  unsigned j;
+  ((void)bound);
+  for (j = start; j < start + len; j++)
+  __loop__(
+    invariant(start <= j && j <= start + len)
+    /*
+     * Coefficients are updated in strided pairs, so the bounds for the
+     * intermediate states alternate twice between the old and new bound
+     */
+    invariant(array_abs_bound(r, 0,           j,           bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j,           start + len, bound))
+    invariant(array_abs_bound(r, start + len, j + len,     bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j + len,     MLKEM_N,     bound)))
+  {
+    int16_t t;
+    t = fqmul(r[j + len], zeta);
+    r[j + len] = r[j] - t;
+    r[j] = r[j] + t;
+  }
+}
+
+/*
+ * Compute one layer of forward NTT
+ * Parameters:
+ * - r: Pointer to base of polynomial
+ * - len: Stride of butterflies in this layer.
+ * - layer: Ghost variable indicating which layer is being applied.
+ *          Must match `len` via `len == MLKEM_N >> layer`.
+ * Note: `len` could be dropped and computed in the function, but
+ *   we are following the structure of the reference NTT from the
+ *   official Kyber implementation here, merely adding `layer` as
+ *   a ghost variable for the specifications.
+ */
+static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
+  requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
+{
+  unsigned start, k;
+  /* `layer` is a ghost variable only needed in the CBMC specification */
+  ((void)layer);
+  /* Twiddle factors for a given layer start at index 2^(layer-1) */
+  k = MLKEM_N / (2 * len);
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(start < MLKEM_N + 2 * len)
+    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
+    invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
+    invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
+  {
+    int16_t zeta = zetas[k++];
+    ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
+  }
+}
+
+/*
+ * Compute full forward NTT
+ * NOTE: This particular implementation satisfies a much tighter
+ * bound on the output coefficients (5*q) than the contractual one (8*q),
+ * but this is not needed in the calling code. Should we change the
+ * base multiplication strategy to require smaller NTT output bounds,
+ * the proof may need strengthening.
+ */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *p)
+{
+  unsigned len, layer;
+  int16_t *r;
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+  r = p->coeffs;
+
+  for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
+  __loop__(
+    invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer))
+    invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)))
+  {
+    ntt_layer(r, len, layer);
+  }
+
+  /* Check the stronger bound */
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
+}
+#else  /* MLKEM_USE_NATIVE_NTT */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *p)
+{
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+  ntt_native(p->coeffs);
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
+}
+#endif /* MLKEM_USE_NATIVE_NTT */
+
+#if !defined(MLKEM_USE_NATIVE_INTT)
+
+/* Compute one layer of inverse NTT */
+static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
+  requires(len == (1 << (8 - layer)))
+  requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+{
+  unsigned start, k;
+  /* `layer` is a ghost variable used only in the specification */
+  ((void)layer);
+  k = MLKEM_N / len - 1;
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+    invariant(start <= MLKEM_N && k <= 127)
+    /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
+    invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
+  {
+    unsigned j;
+    int16_t zeta = zetas[k--];
+    for (j = start; j < start + len; j++)
+    __loop__(
+      invariant(start <= j && j <= start + len)
+      invariant(start <= MLKEM_N && k <= 127)
+      invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+    {
+      int16_t t = r[j];
+      r[j] = barrett_reduce(t + r[j + len]);
+      r[j + len] = r[j + len] - t;
+      r[j + len] = fqmul(r[j + len], zeta);
+    }
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *p)
+{
+  /*
+   * Scale input polynomial to account for Montgomery factor
+   * and NTT twist. This also brings coefficients down to
+   * absolute value < MLKEM_Q.
+   */
+  unsigned j, len, layer;
+  const int16_t f = 1441;
+  int16_t *r = p->coeffs;
+
+  for (j = 0; j < MLKEM_N; j++)
+  __loop__(
+    invariant(j <= MLKEM_N)
+    invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
+  {
+    r[j] = fqmul(r[j], f);
+  }
+
+  /* Run the invNTT layers */
+  for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
+  __loop__(
+    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
+    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+  {
+    invntt_layer(p->coeffs, len, layer);
+  }
+
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
+}
+#else  /* MLKEM_USE_NATIVE_INTT */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *p)
+{
+  intt_native(p->coeffs);
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
+}
+#endif /* MLKEM_USE_NATIVE_INTT */
+
 #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 #define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.h
index 6a14c785d..cb0d67c1a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly.h
@@ -9,7 +9,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "reduce.h"
+#include "debug.h"
 #include "verify.h"
 
 /* Absolute exclusive upper bound for the output of the inverse NTT */
@@ -18,6 +18,9 @@
 /* Absolute exclusive upper bound for the output of the forward NTT */
 #define NTT_BOUND (8 * MLKEM_Q)
 
+#define zetas MLKEM_NAMESPACE(zetas)
+extern const int16_t zetas[128];
+
 /*
  * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
  * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
@@ -38,520 +41,6 @@ typedef struct
   int16_t coeffs[MLKEM_N >> 1];
 } poly_mulcache;
 
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1)
-#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4)
-#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5)
-#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10)
-#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11)
-#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4)
-#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5)
-#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10)
-#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11)
-#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q)
-/* End of static namespacing */
-
-/************************************************************
- * Name: scalar_compress_d1
- *
- * Description: Computes round(u * 2 / q)
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 1.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d1(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 2)
-  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
-{
-  uint32_t d0 = u << 1;
-  d0 *= 645083;
-  d0 += 1u << 30;
-  d0 >>= 31;
-  return d0;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_compress_d4
- *
- * Description: Computes round(u * 16 / q) % 16
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 4.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d4(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 16)
-  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
-{
-  uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */
-  return (d0 + (1u << 27)) >> 28;      /* round(d0/2^28) */
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d4
- *
- * Description: Computes round(u * q / 16)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 4.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d4(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 16)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 8) / 16; }
-
-/************************************************************
- * Name: scalar_compress_d5
- *
- * Description: Computes round(u * 32 / q) % 32
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 5.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d5(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 32)
-  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
-{
-  uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */
-  return (d0 + (1u << 26)) >> 27;      /* round(d0/2^27) */
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d5
- *
- * Description: Computes round(u * q / 32)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 5.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 32
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d5(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 32)
-  ensures(return_value <= MLKEM_Q - 1)
-) { return ((u * MLKEM_Q) + 16) / 32; }
-
-/************************************************************
- * Name: scalar_compress_d10
- *
- * Description: Computes round(u * 2**10 / q) % 2**10
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d10(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < (1u << 10))
-  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
-{
-  uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^32 / MLKEM_Q) */
-  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
-  return (d0 & 0x3FF);
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d10
- *
- * Description: Computes round(u * q / 1024)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d10(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 1024)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 512) / 1024; }
-
-/************************************************************
- * Name: scalar_compress_d11
- *
- * Description: Computes round(u * 2**11 / q) % 2**11
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 11.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d11(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < (1u << 11))
-  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
-{
-  uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */
-  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
-  return (d0 & 0x7FF);
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d11
- *
- * Description: Computes round(u * q / 1024)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d11(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 2048)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 1024) / 2048; }
-
-/************************************************************
- * Name: scalar_signed_to_unsigned_q
- *
- * Description: converts signed polynomial coefficient
- *              from signed (-3328 .. 3328) form to
- *              unsigned form (0 .. 3328).
- *
- * Note: Cryptographic constant time implementation
- *
- * Examples:       0 -> 0
- *                 1 -> 1
- *              3328 -> 3328
- *                -1 -> 3328
- *                -2 -> 3327
- *             -3328 -> 1
- *
- * Arguments: c: signed coefficient to be converted
- ************************************************************/
-static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
-__contract__(
-  requires(c > -MLKEM_Q && c < MLKEM_Q)
-  ensures(return_value >= 0 && return_value < MLKEM_Q)
-  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
-{
-  debug_assert_abs_bound(&c, 1, MLKEM_Q);
-
-  /* Add Q if c is negative, but in constant time */
-  c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
-
-  /* and therefore cast to uint16_t is safe. */
-  debug_assert_bound(&c, 1, 0, MLKEM_Q);
-  return (uint16_t)c;
-}
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
-    (MLKEM_K == 2 || MLKEM_K == 3)
-#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
-/*************************************************
- * Name:        poly_compress_d4
- *
- * Description: Compression (4 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
-
-#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
-/*************************************************
- * Name:        poly_compress_d10
- *
- * Description: Compression (10 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
-
-#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
-/*************************************************
- * Name:        poly_decompress_d4
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
-
-#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
-/*************************************************
- * Name:        poly_decompress_d10
- *
- * Description: De-serialization and subsequent decompression (10 bits) of a
- *              polynomial; approximate inverse of poly_compress_d10
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d10(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
-          || MLKEM_K == 3) */
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
-#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
-/*************************************************
- * Name:        poly_compress_d5
- *
- * Description: Compression (5 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
-
-#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
-/*************************************************
- * Name:        poly_compress_d11
- *
- * Description: Compression (11 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
-
-#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
-/*************************************************
- * Name:        poly_decompress_d5
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
-
-#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
-/*************************************************
- * Name:        poly_decompress_d11
- *
- * Description: De-serialization and subsequent decompression (11 bits) of a
- *              polynomial; approximate inverse of poly_compress_d11
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d11(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
-        */
-
-#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
-/*************************************************
- * Name:        poly_tobytes
- *
- * Description: Serialization of a polynomial.
- *              Signed coefficients are converted to
- *              unsigned form before serialization.
- *
- * Arguments:   INPUT:
- *              - a: const pointer to input polynomial,
- *                with each coefficient in the range [0,1,..,Q-1]
- *              OUTPUT
- *              - r: pointer to output byte array
- *                   (of MLKEM_POLYBYTES bytes)
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYBYTES))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r))
-);
-
-
-#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes)
-/*************************************************
- * Name:        poly_frombytes
- *
- * Description: De-serialization of a polynomial.
- *
- * Arguments:   INPUT
- *              - a: pointer to input byte array
- *                   (of MLKEM_POLYBYTES bytes)
- *              OUTPUT
- *              - r: pointer to output polynomial, with
- *                   each coefficient unsigned and in the range
- *                   0 .. 4095
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))
-);
-
-
-#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg)
-/*************************************************
- * Name:        poly_frommsg
- *
- * Description: Convert 32-byte message to polynomial
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *msg: pointer to input message
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
-__contract__(
-  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
-
-#define poly_tomsg MLKEM_NAMESPACE(poly_tomsg)
-/*************************************************
- * Name:        poly_tomsg
- *
- * Description: Convert polynomial to 32-byte message
- *
- * Arguments:   - uint8_t *msg: pointer to output message
- *              - const poly *r: pointer to input polynomial
- *                Coefficients must be unsigned canonical
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r)
-__contract__(
-  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(msg))
-);
-
 #define poly_basemul_montgomery_cached \
   MLKEM_NAMESPACE(poly_basemul_montgomery_cached)
 /*************************************************
@@ -715,4 +204,56 @@ __contract__(
   assigns(object_whole(r))
 );
 
+#define poly_ntt MLKEM_NAMESPACE(poly_ntt)
+/*************************************************
+ * Name:        poly_ntt
+ *
+ * Description: Computes negacyclic number-theoretic transform (NTT) of
+ *              a polynomial in place.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bound by MLKEM_Q in absolute value.
+ *
+ *              The output polynomial is in bitreversed order, and
+ *              coefficient-wise bound by NTT_BOUND in absolute value.
+ *
+ *              (NOTE: Sometimes the input to the NTT is actually smaller,
+ *               which gives better bounds.)
+ *
+ * Arguments:   - poly *r: pointer to in/output polynomial
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND))
+);
+
+#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont)
+/*************************************************
+ * Name:        poly_invntt_tomont
+ *
+ * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+ *              of a polynomial in place;
+ *              inputs assumed to be in bitreversed order, output in normal
+ *              order
+ *
+ *              The input is assumed to be in bitreversed order, and can
+ *              have arbitrary coefficients in int16_t.
+ *
+ *              The output polynomial is in normal order, and
+ *              coefficient-wise bound by INVNTT_BOUND in absolute value.
+ *
+ * Arguments:   - poly *r: pointer to in/output polynomial
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND))
+);
+
 #endif /* POLY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly_k.c
similarity index 97%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/polyvec.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly_k.c
index 50ea1c34a..c2d330ea9 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/polyvec.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly_k.c
@@ -2,13 +2,12 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#include "polyvec.h"
+#include "poly_k.h"
 #include <stdint.h>
 #include <string.h>
 #include "arith_backend.h"
-#include "cbd.h"
-#include "ntt.h"
-#include "poly.h"
+#include "compress.h"
+#include "sampling.h"
 #include "symmetric.h"
 
 #include "debug.h"
@@ -131,7 +130,9 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
   /* Omitting bounds assertion for cache since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
-  polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache);
+  polyvec_basemul_acc_montgomery_cached_native(r->coeffs, (const int16_t *)a,
+                                               (const int16_t *)b,
+                                               (const int16_t *)b_cache);
 }
 #endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly_k.h
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly_k.h
index 8be8579e0..0aea95912 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/poly_k.h
@@ -2,11 +2,12 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#ifndef POLYVEC_H
-#define POLYVEC_H
+#ifndef POLY_K_H
+#define POLY_K_H
 
 #include <stdint.h>
 #include "common.h"
+#include "compress.h"
 #include "poly.h"
 
 #define polyvec MLKEM_NAMESPACE_K(polyvec)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/reduce.h
deleted file mode 100644
index b432a4201..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/reduce.h
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef REDUCE_H
-#define REDUCE_H
-
-#include <stdint.h>
-#include "cbmc.h"
-#include "common.h"
-#include "debug.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16)
-#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic)
-#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce)
-#define fqmul MLKEM_NAMESPACE(fqmul)
-#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce)
-/* End of static namespacing */
-
-#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */
-
-/*************************************************
- * Name:        cast_uint16_to_int16
- *
- * Description: Cast uint16 value to int16
- *
- * Returns:
- *   input x in     0 .. 32767: returns value unchanged
- *   input x in 32768 .. 65535: returns (x - 65536)
- **************************************************/
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
-ALWAYS_INLINE
-static INLINE int16_t cast_uint16_to_int16(uint16_t x)
-{
-  /*
-   * PORTABILITY: This relies on uint16_t -> int16_t
-   * being implemented as the inverse of int16_t -> uint16_t,
-   * which is implementation-defined (C99 6.3.1.3 (3))
-   * CBMC (correctly) fails to prove this conversion is OK,
-   * so we have to suppress that check here
-   */
-  return (int16_t)x;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/*************************************************
- * Name:        montgomery_reduce_generic
- *
- * Description: Generic Montgomery reduction; given a 32-bit integer a, computes
- *              16-bit integer congruent to a * R^-1 mod q, where R=2^16
- *
- * Arguments:   - int32_t a: input integer to be reduced
- *
- * Returns:     integer congruent to a * R^-1 modulo q, with absolute value
- *                <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2
- *
- **************************************************/
-ALWAYS_INLINE
-static INLINE int16_t montgomery_reduce_generic(int32_t a)
-{
-  /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */
-  const uint32_t QINV = 62209; /* q^-1 mod 2^16 */
-
-  /*  Compute a*q^{-1} mod 2^16 in unsigned representatives */
-  const uint16_t a_reduced = a & UINT16_MAX;
-  const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
-
-  /* Lift to signed canonical representative mod 2^16. */
-  const int16_t t = cast_uint16_to_int16(a_inverted);
-
-  int32_t r = a - ((int32_t)t * MLKEM_Q);
-  /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */
-
-  /*
-   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
-   * implementation-defined for negative left argument. Here,
-   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
-   */
-  r = r >> 16;
-  /* Bounds: |r >> 16| <= ceil(|r| / 2^16)
-   *                   <= ceil(|a| / 2^16 + MLKEM_Q / 2)
-   *                   <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
-   *
-   * (Note that |a >> n| = ceil(|a| / 2^16) for negative a)
-   */
-
-  return (int16_t)r;
-}
-
-/*************************************************
- * Name:        montgomery_reduce
- *
- * Description: Montgomery reduction
- *
- * Arguments:   - int32_t a: input integer to be reduced
- *                  Must be smaller than 2 * 2^12 * 2^15 in absolute value.
- *
- * Returns:     integer congruent to a * R^-1 modulo q,
- *              smaller than 2 * q in absolute value.
- **************************************************/
-static INLINE int16_t montgomery_reduce(int32_t a)
-__contract__(
-  requires(a > -(2 * UINT12_LIMIT * 32768))
-  requires(a <  (2 * UINT12_LIMIT * 32768))
-  ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
-)
-{
-  int16_t res;
-  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
-
-  res = montgomery_reduce_generic(a);
-  /* Bounds:
-   * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2
-   *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
-   *        < 2 * MLKEM_Q */
-
-  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
-  return res;
-}
-
-/*************************************************
- * Name:        fqmul
- *
- * Description: Montgomery multiplication modulo q=3329
- *
- * Arguments:   - int16_t a: first factor
- *                  Can be any int16_t.
- *              - int16_t b: second factor.
- *                  Must be signed canonical (abs value <(q+1)/2)
- *
- * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and
- * smaller than q in absolute value.
- *
- **************************************************/
-static INLINE int16_t fqmul(int16_t a, int16_t b)
-__contract__(
-  requires(b > -HALF_Q)
-  requires(b < HALF_Q)
-  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
-)
-{
-  int16_t res;
-  debug_assert_abs_bound(&b, 1, HALF_Q);
-
-  res = montgomery_reduce((int32_t)a * (int32_t)b);
-  /* Bounds:
-   * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
-   *        < MLKEM_Q
-   */
-
-  debug_assert_abs_bound(&res, 1, MLKEM_Q);
-  return res;
-}
-
-/*************************************************
- * Name:        barrett_reduce
- *
- * Description: Barrett reduction; given a 16-bit integer a, computes
- *              centered representative congruent to a mod q in
- *              {-(q-1)/2,...,(q-1)/2}
- *
- * Arguments:   - int16_t a: input integer to be reduced
- *
- * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
- **************************************************/
-static INLINE int16_t barrett_reduce(int16_t a)
-__contract__(
-  ensures(return_value > -HALF_Q && return_value < HALF_Q)
-)
-{
-  /*
-   * To divide by MLKEM_Q using Barrett multiplication, the "magic number"
-   * multiplier is round_to_nearest(2**26/MLKEM_Q)
-   */
-  const int BPOWER = 26;
-  const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q;
-
-  /*
-   * Compute round_to_nearest(a/MLKEM_Q) using the multiplier
-   * above and shift by BPOWER places.
-   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
-   * implementation-defined for negative left argument. Here,
-   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
-   */
-  const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER;
-
-  /*
-   * t is in -10 .. +10, so we need 32-bit math to
-   * evaluate t * MLKEM_Q and the subsequent subtraction
-   */
-  int16_t res = (int16_t)(a - t * MLKEM_Q);
-
-  debug_assert_abs_bound(&res, 1, HALF_Q);
-  return res;
-}
-
-#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/sampling.c
similarity index 73%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/rej_uniform.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/sampling.c
index cbbe4407f..98cbdcb74 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/rej_uniform.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/sampling.c
@@ -9,7 +9,7 @@
 #include "debug.h"
 #include "fips202.h"
 #include "fips202x4.h"
-#include "rej_uniform.h"
+#include "sampling.h"
 #include "symmetric.h"
 
 /* Static namespacing
@@ -18,6 +18,8 @@
  * within a single compilation unit. */
 #define rej_uniform MLKEM_NAMESPACE(rej_uniform)
 #define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
+#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
+#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
 /* End of static namespacing */
 
 static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
@@ -233,9 +235,113 @@ void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
   xof_release(&state);
 }
 
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
+#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
+/* End of static namespacing */
+
+/*************************************************
+ * Name:        load32_littleendian
+ *
+ * Description: load 4 bytes into a 32-bit integer
+ *              in little-endian order
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x
+ **************************************************/
+static uint32_t load32_littleendian(const uint8_t x[4])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  r |= (uint32_t)x[3] << 24;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
+  {
+    unsigned j;
+    uint32_t t = load32_littleendian(buf + 4 * i);
+    uint32_t d = t & 0x55555555;
+    d += (t >> 1) & 0x55555555;
+
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
+    {
+      const int16_t a = (d >> (4 * j + 0)) & 0x3;
+      const int16_t b = (d >> (4 * j + 2)) & 0x3;
+      r->coeffs[8 * i + j] = a - b;
+    }
+  }
+}
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+/*************************************************
+ * Name:        load24_littleendian
+ *
+ * Description: load 3 bytes into a 32-bit integer
+ *              in little-endian order.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
+ **************************************************/
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 4; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 4)
+    invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
+  {
+    unsigned j;
+    const uint32_t t = load24_littleendian(buf + 3 * i);
+    uint32_t d = t & 0x00249249;
+    d += (t >> 1) & 0x00249249;
+    d += (t >> 2) & 0x00249249;
+
+    for (j = 0; j < 4; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 4 && j <= 4)
+      invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
+    {
+      const int16_t a = (d >> (6 * j + 0)) & 0x7;
+      const int16_t b = (d >> (6 * j + 3)) & 0x7;
+      r->coeffs[4 * i + j] = a - b;
+    }
+  }
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
+          3 */
+
 #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
-#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform)
-int empty_cu_rej_uniform;
+#define empty_cu_sampling MLKEM_NAMESPACE_K(empty_cu_sampling)
+int empty_cu_sampling;
 
 #endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/sampling.h
similarity index 63%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/sampling.h
index 801287259..cc524e0fc 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/sampling.h
@@ -2,8 +2,8 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#ifndef REJ_UNIFORM_H
-#define REJ_UNIFORM_H
+#ifndef SAMPLING_H
+#define SAMPLING_H
 
 #include <stdint.h>
 #include <stdlib.h>
@@ -11,6 +11,37 @@
 #include "common.h"
 #include "poly.h"
 
+#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
+/*************************************************
+ * Name:        poly_cbd2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
+/*************************************************
+ * Name:        poly_cbd3
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=3.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3 */
+
 #define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
 /*************************************************
  * Name:        poly_rej_uniform_x4
@@ -60,4 +91,4 @@ __contract__(
   assigns(memory_slice(entry, sizeof(poly)))
   ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
 
-#endif /* REJ_UNIFORM_H */
+#endif /* SAMPLING_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/zetas.c
index 4ef887c62..987f0dce4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/zetas.c
@@ -10,7 +10,7 @@
 
 #include "common.h"
 #if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-#include "ntt.h"
+#include "poly.h"
 
 /*
  * Table of zeta values used in the reference NTT and inverse NTT.
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/arith_backend.h
index 0543b1bd1..ade31cda1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/arith_backend.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/arith_backend.h
@@ -17,7 +17,7 @@
  * Keep this _after_ the inclusion of the backend; otherwise,
  * the sanity checks won't have an effect. */
 #if defined(MLKEM_NATIVE_CHECK_APIS)
-#include "api.h"
+#include "native/api.h"
 #endif
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.c
deleted file mode 100644
index 1e6b7c5d1..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
-
-#include <stdint.h>
-#include "cbd.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
-#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
-/* End of static namespacing */
-
-/*************************************************
- * Name:        load32_littleendian
- *
- * Description: load 4 bytes into a 32-bit integer
- *              in little-endian order
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x
- **************************************************/
-static uint32_t load32_littleendian(const uint8_t x[4])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  r |= (uint32_t)x[3] << 24;
-  return r;
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
-  {
-    unsigned j;
-    uint32_t t = load32_littleendian(buf + 4 * i);
-    uint32_t d = t & 0x55555555;
-    d += (t >> 1) & 0x55555555;
-
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
-    {
-      const int16_t a = (d >> (4 * j + 0)) & 0x3;
-      const int16_t b = (d >> (4 * j + 2)) & 0x3;
-      r->coeffs[8 * i + j] = a - b;
-    }
-  }
-}
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
-/*************************************************
- * Name:        load24_littleendian
- *
- * Description: load 3 bytes into a 32-bit integer
- *              in little-endian order.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
- **************************************************/
-static uint32_t load24_littleendian(const uint8_t x[3])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  return r;
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 4; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 4)
-    invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
-  {
-    unsigned j;
-    const uint32_t t = load24_littleendian(buf + 3 * i);
-    uint32_t d = t & 0x00249249;
-    d += (t >> 1) & 0x00249249;
-    d += (t >> 2) & 0x00249249;
-
-    for (j = 0; j < 4; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 4 && j <= 4)
-      invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
-    {
-      const int16_t a = (d >> (6 * j + 0)) & 0x7;
-      const int16_t b = (d >> (6 * j + 3)) & 0x7;
-      r->coeffs[4 * i + j] = a - b;
-    }
-  }
-}
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
-          3 */
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd)
-int empty_cu_cbd;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.h
deleted file mode 100644
index 54c1f5b90..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/cbd.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef CBD_H
-#define CBD_H
-
-#include <stdint.h>
-#include "common.h"
-#include "poly.h"
-
-#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
-/*************************************************
- * Name:        poly_cbd2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
-#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
-/*************************************************
- * Name:        poly_cbd3
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=3.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */
-
-#endif /* CBD_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/common.h
index 4f326333e..62ed53ab1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/common.h
@@ -15,12 +15,19 @@
 #include "sys.h"
 
 /* Include backend metadata */
-#if defined(MLKEM_USE_NATIVE)
-#if defined(MLKEM_NATIVE_ARITH_BACKEND)
-#include MLKEM_NATIVE_ARITH_BACKEND
+#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH)
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_FILE)
+#include MLKEM_NATIVE_ARITH_BACKEND_FILE
+#else
+#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_ARITH is set, but MLKEM_NATIVE_ARITH_BACKEND_FILE is not.
+#endif
 #endif
-#if defined(MLKEM_NATIVE_FIPS202_BACKEND)
-#include MLKEM_NATIVE_FIPS202_BACKEND
+
+#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202)
+#if defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE)
+#include MLKEM_NATIVE_FIPS202_BACKEND_FILE
+#else
+#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, but MLKEM_NATIVE_FIPS202_BACKEND_FILE is not.
 #endif
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/compress.c
new file mode 100644
index 000000000..a03fe0ac4
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/compress.c
@@ -0,0 +1,395 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
+#include <stdint.h>
+#include <string.h>
+#include "arith_backend.h"
+#include "cbmc.h"
+#include "compress.h"
+#include "debug.h"
+#include "verify.h"
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 16)))
+    {
+      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
+    }
+
+    r[i * 4] = t[0] | (t[1] << 4);
+    r[i * 4 + 1] = t[2] | (t[3] << 4);
+    r[i * 4 + 2] = t[4] | (t[5] << 4);
+    r[i * 4 + 3] = t[6] | (t[7] << 4);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(invariant(j <= MLKEM_N / 4))
+  {
+    unsigned k;
+    uint16_t t[4];
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k <= 4)
+      invariant(forall(r, 0, k, t[r] < (1u << 10))))
+    {
+      t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 10-bit in size.
+     */
+    r[5 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
+    r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
+    r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
+    r[5 * j + 4] = (t[3] >> 2);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
+  {
+    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
+    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 4)
+    invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[4];
+    uint8_t const *base = &a[5 * j];
+
+    t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6));
+    t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4));
+    t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2));
+
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k <= 4)
+      invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 32)))
+    {
+      t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
+    }
+
+    /*
+     * Explicitly truncate to avoid warning about
+     * implicit truncation in CBMC, and use array indexing into
+     * r rather than pointer-arithmetic to simplify verification
+     */
+    r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
+    r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
+    r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
+    r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
+    r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(invariant(j <= MLKEM_N / 8))
+  {
+    unsigned k;
+    uint16_t t[8];
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+    {
+      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 11-bit in size.
+     */
+    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
+    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
+    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
+    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
+    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
+    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
+    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
+    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
+    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
+    r[11 * j + 10] = (t[7] >> 3);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
+  {
+    unsigned j;
+    uint8_t t[8];
+    const unsigned offset = i * 5;
+    /*
+     * Explicitly truncate to avoid warning about
+     * implicit truncation in CBMC and unwind loop for ease
+     * of proof.
+     */
+
+    /*
+     * Decompress 5 8-bit bytes (so 40 bits) into
+     * 8 5-bit values stored in t[]
+     */
+    t[0] = 0x1F & (a[offset + 0] >> 0);
+    t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3));
+    t[2] = 0x1F & (a[offset + 1] >> 2);
+    t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1));
+    t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4));
+    t[5] = 0x1F & (a[offset + 3] >> 1);
+    t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2));
+    t[7] = 0x1F & (a[offset + 4] >> 3);
+
+    /* and copy to the correct slice in r[] */
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(j <= 8 && i <= MLKEM_N / 8)
+      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[8];
+    uint8_t const *base = &a[11 * j];
+    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
+    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
+                    ((uint16_t)base[4] << 10));
+    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
+    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
+    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
+                    ((uint16_t)base[8] << 9));
+    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
+    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
+
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(invariant(i <= MLKEM_N / 2))
+  {
+    const uint16_t t0 = a->coeffs[2 * i];
+    const uint16_t t1 = a->coeffs[2 * i + 1];
+    /*
+     * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
+     * significant data, so these can be packed into 24 bits or exactly
+     * 3 bytes, as follows.
+     */
+
+    /* Least significant bits 0 - 7 of t0. */
+    r[3 * i + 0] = t0 & 0xFF;
+
+    /*
+     * Most significant bits 8 - 11 of t0 become the least significant
+     * nibble of the second byte. The least significant 4 bits
+     * of t1 become the upper nibble of the second byte.
+     */
+    r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+
+    /* Bits 4 - 11 of t1 become the third byte. */
+    r[3 * i + 2] = t1 >> 4;
+  }
+}
+#else  /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  poly_tobytes_native(r, a->coeffs);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
+  {
+    const uint8_t t0 = a[3 * i + 0];
+    const uint8_t t1 = a[3 * i + 1];
+    const uint8_t t2 = a[3 * i + 2];
+    r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
+    r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
+  }
+
+  /* Note that the coefficients are not canonical */
+  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
+}
+#else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+  poly_frombytes_native(r->coeffs, a);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+{
+  unsigned i;
+#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
+#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
+#endif
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
+  {
+    unsigned j;
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <  MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
+    {
+      /* Prevent the compiler from recognizing this as a bit selection */
+      uint8_t mask = value_barrier_u8(1u << j);
+      r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
+    }
+  }
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    msg[i] = 0;
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8))
+    {
+      uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
+      msg[i] |= t << j;
+    }
+  }
+}
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_compress MLKEM_NAMESPACE_K(empty_cu_compress)
+int empty_cu_compress;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/compress.h
new file mode 100644
index 000000000..409dbe519
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/compress.h
@@ -0,0 +1,495 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef COMPRESS_H
+#define COMPRESS_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "cbmc.h"
+#include "common.h"
+#include "debug.h"
+#include "poly.h"
+#include "verify.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1)
+#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4)
+#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5)
+#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10)
+#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11)
+#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4)
+#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5)
+#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10)
+#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11)
+/* End of static namespacing */
+
+/************************************************************
+ * Name: scalar_compress_d1
+ *
+ * Description: Computes round(u * 2 / q)
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 1.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d1(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 2)
+  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
+{
+  uint32_t d0 = u << 1;
+  d0 *= 645083;
+  d0 += 1u << 30;
+  d0 >>= 31;
+  return d0;
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_compress_d4
+ *
+ * Description: Computes round(u * 16 / q) % 16
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d4(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 16)
+  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
+{
+  uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */
+  return (d0 + (1u << 27)) >> 28;      /* round(d0/2^28) */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d4
+ *
+ * Description: Computes round(u * q / 16)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 16
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d4(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 16)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 8) / 16; }
+
+/************************************************************
+ * Name: scalar_compress_d5
+ *
+ * Description: Computes round(u * 32 / q) % 32
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d5(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 32)
+  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
+{
+  uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */
+  return (d0 + (1u << 26)) >> 27;      /* round(d0/2^27) */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d5
+ *
+ * Description: Computes round(u * q / 32)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 32
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d5(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 32)
+  ensures(return_value <= MLKEM_Q - 1)
+) { return ((u * MLKEM_Q) + 16) / 32; }
+
+/************************************************************
+ * Name: scalar_compress_d10
+ *
+ * Description: Computes round(u * 2**10 / q) % 2**10
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The product in this routine exceeds UINT32_MAX for large values of u, which
+ * is why it is computed in uint64_t. NOTE(review): nothing wraps in 64 bits.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d10(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 10))
+  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
+{
+  uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^33 / MLKEM_Q) */
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33; /* round(d0/2^33) */
+  return (d0 & 0x3FF); /* reduce modulo 2^10 */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d10
+ *
+ * Description: Computes round(u * q / 1024)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical representative modulo 1024
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d10(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 1024)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 512) / 1024; }
+
+/************************************************************
+ * Name: scalar_compress_d11
+ *
+ * Description: Computes round(u * 2**11 / q) % 2**11
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The product in this routine exceeds UINT32_MAX for large values of u, which
+ * is why it is computed in uint64_t. NOTE(review): nothing wraps in 64 bits.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d11(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 11))
+  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
+{
+  uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33; /* round(d0/2^33) */
+  return (d0 & 0x7FF); /* reduce modulo 2^11 */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d11
+ *
+ * Description: Computes round(u * q / 2048)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical representative modulo 2048
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d11(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 2048)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 1024) / 2048; }
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
+    (MLKEM_K == 2 || MLKEM_K == 3)
+#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
+/*************************************************
+ * Name:        poly_compress_d4
+ *
+ * Description: Compression (4 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
+
+#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
+/*************************************************
+ * Name:        poly_compress_d10
+ *
+ * Description: Compression (10 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
+
+#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
+/*************************************************
+ * Name:        poly_decompress_d4
+ *
+ * Description: De-serialization and subsequent decompression (4 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d4
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
+
+#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
+/*************************************************
+ * Name:        poly_decompress_d10
+ *
+ * Description: De-serialization and subsequent decompression (10 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d10
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
+/*************************************************
+ * Name:        poly_compress_d5
+ *
+ * Description: Compression (5 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
+
+#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
+/*************************************************
+ * Name:        poly_compress_d11
+ *
+ * Description: Compression (11 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
+
+#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
+/*************************************************
+ * Name:        poly_decompress_d5
+ *
+ * Description: De-serialization and subsequent decompression (5 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d5
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+
+#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
+/*************************************************
+ * Name:        poly_decompress_d11
+ *
+ * Description: De-serialization and subsequent decompression (11 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d11
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
+        */
+
+#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
+/*************************************************
+ * Name:        poly_tobytes
+ *
+ * Description: Serialization of a polynomial.
+ *              Input coefficients must already be unsigned
+ *              canonical (see the contract below).
+ *
+ * Arguments:   INPUT:
+ *              - a: const pointer to input polynomial,
+ *                with each coefficient in the range [0,1,..,Q-1]
+ *              OUTPUT
+ *              - r: pointer to output byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYBYTES))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r))
+);
+
+
+#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes)
+/*************************************************
+ * Name:        poly_frombytes
+ *
+ * Description: De-serialization of a polynomial.
+ *
+ * Arguments:   INPUT
+ *              - a: pointer to input byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ *              OUTPUT
+ *              - r: pointer to output polynomial, with
+ *                   each coefficient unsigned and in the range
+ *                   0 .. 4095
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))
+);
+
+
+#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg)
+/*************************************************
+ * Name:        poly_frommsg
+ *
+ * Description: Convert 32-byte message to polynomial
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *msg: pointer to input message
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+);
+
+#define poly_tomsg MLKEM_NAMESPACE(poly_tomsg)
+/*************************************************
+ * Name:        poly_tomsg
+ *
+ * Description: Convert polynomial to 32-byte message
+ *
+ * Arguments:   - uint8_t *msg: pointer to output message
+ *              - const poly *r: pointer to input polynomial
+ *                Coefficients must be unsigned canonical
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r)
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(msg))
+);
+
+#endif /* COMPRESS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/config.h
index fa89370ce..e975ede95 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/config.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/config.h
@@ -122,46 +122,87 @@
 /* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 /******************************************************************************
- * Name:        MLKEM_USE_NATIVE
+ * Name:        MLKEM_USE_NATIVE_BACKEND_ARITH
  *
- * Description: Determines whether a native backend should
- *              be used, if available.
+ * Description: Determines whether a native arithmetic backend should be used.
+ *
+ *              The arithmetic backend covers performance critical functions
+ *              such as the number-theoretic transform (NTT).
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the arithmetic backend to be used is
+ *              determined by MLKEM_NATIVE_ARITH_BACKEND_FILE: If the latter is
+ *              unset, the default backend for the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
  *
  *              This can also be set using CFLAGS.
  *
  *****************************************************************************/
-#if !defined(MLKEM_USE_NATIVE)
-/* #define MLKEM_USE_NATIVE */
+#if !defined(MLKEM_USE_NATIVE_BACKEND_ARITH)
+/* #define MLKEM_USE_NATIVE_BACKEND_ARITH */
 #endif
 
 /******************************************************************************
- * Name:        MLKEM_NATIVE_ARITH_BACKEND
+ * Name:        MLKEM_NATIVE_ARITH_BACKEND_FILE
  *
  * Description: The arithmetic backend to use.
  *
- *              This must be the filename of an arithmetic backend.
- *              See the existing backends for examples.
+ *              If MLKEM_USE_NATIVE_BACKEND_ARITH is unset, this option
+ *              is ignored.
+ *
+ *              If MLKEM_USE_NATIVE_BACKEND_ARITH is set, this option must
+ *              either be undefined or the filename of an arithmetic backend.
+ *              If unset, the default backend will be used.
  *
  *              This can be set using CFLAGS.
  *
  *****************************************************************************/
-#if defined(MLKEM_USE_NATIVE) && !defined(MLKEM_NATIVE_ARITH_BACKEND)
-#define MLKEM_NATIVE_ARITH_BACKEND "default.h"
-#endif /* MLKEM_NATIVE_ARITH_BACKEND */
+#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) && \
+    !defined(MLKEM_NATIVE_ARITH_BACKEND_FILE)
+#define MLKEM_NATIVE_ARITH_BACKEND_FILE "native/default.h"
+#endif
 
 /******************************************************************************
- * Name:        MLKEM_NATIVE_FIPS202_BACKEND
+ * Name:        MLKEM_USE_NATIVE_BACKEND_FIPS202
+ *
+ * Description: Determines whether a native FIPS202 backend should be used.
+ *
+ *              The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is
+ *              the performance bottleneck of SHA3 and SHAKE.
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the FIPS202 backend to be used is
+ *              determined by MLKEM_NATIVE_FIPS202_BACKEND_FILE: If the latter
+ *              is unset, the default backend for the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if !defined(MLKEM_USE_NATIVE_BACKEND_FIPS202)
+/* #define MLKEM_USE_NATIVE_BACKEND_FIPS202 */
+#endif
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_FIPS202_BACKEND_FILE
  *
  * Description: The FIPS-202 backend to use.
  *
- *              This must be the filename of an FIPS-202 backend.
+ *              If MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, this option must
+ *              either be undefined or the filename of a FIPS202 backend.
+ *              If unset, the default backend will be used.
  *
  *              This can be set using CFLAGS.
  *
  *****************************************************************************/
-#if defined(MLKEM_USE_NATIVE_FIPS202) && !defined(MLKEM_NATIVE_FIPS202_BACKEND)
-#define MLKEM_NATIVE_FIPS202_BACKEND "native/default.h"
-#endif /* MLKEM_NATIVE_FIPS202_BACKEND */
+#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) && \
+    !defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE)
+#define MLKEM_NATIVE_FIPS202_BACKEND_FILE "fips202/native/default.h"
+#endif
 
 /*************************  Config internals  ********************************/
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.c
index 0cfcc3e9e..318d0fc77 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.c
@@ -9,11 +9,10 @@
 #include "fips202.h"
 #include "fips202x4.h"
 #include "indcpa.h"
-#include "ntt.h"
 #include "poly.h"
-#include "polyvec.h"
+#include "poly_k.h"
 #include "randombytes.h"
-#include "rej_uniform.h"
+#include "sampling.h"
 #include "symmetric.h"
 
 #include "arith_backend.h"
@@ -149,14 +148,14 @@ static void unpack_ciphertext(polyvec *b, poly *v,
 #define poly_permute_bitrev_to_custom \
   MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom)
 
-static INLINE void poly_permute_bitrev_to_custom(poly *data)
+static INLINE void poly_permute_bitrev_to_custom(int16_t data[MLKEM_N])
 __contract__(
   /* We don't specify that this should be a permutation, but only
    * that it does not change the bound established at the end of gen_matrix. */
-  requires(memory_no_alias(data, sizeof(poly)))
-  requires(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N))
+  requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))
   assigns(memory_slice(data, sizeof(poly)))
-  ensures(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
+  ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
 #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
 
 /* Not static for benchmarking */
@@ -245,7 +244,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
   {
     for (j = 0; j < MLKEM_K; j++)
     {
-      poly_permute_bitrev_to_custom(&a[i].vec[j]);
+      poly_permute_bitrev_to_custom(a[i].vec[j].coeffs);
     }
   }
 }
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.h
index 2c4fda3c4..b4d5985bf 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/indcpa.h
@@ -8,7 +8,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "polyvec.h"
+#include "poly_k.h"
 
 #define gen_matrix MLKEM_NAMESPACE_K(gen_matrix)
 /*************************************************
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/native/api.h
similarity index 90%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/api.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/native/api.h
index 792ecb8a4..0704f9dcd 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/api.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/native/api.h
@@ -23,8 +23,7 @@
 #define MLKEM_NATIVE_ARITH_NATIVE_API_H
 
 #include <stdint.h>
-#include "poly.h"
-#include "polyvec.h"
+#include "../common.h"
 
 /*
  * This is the C<->native interface allowing for the drop-in of
@@ -65,9 +64,9 @@
  *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
  *              for more information.
  *
- * Arguments:   - poly *p: pointer to in/output polynomial
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
  **************************************************/
-static INLINE void ntt_native(poly *);
+static INLINE void ntt_native(int16_t p[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_NTT */
 
 #if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
@@ -96,10 +95,10 @@ and to/from bytes conversions."
  *
  *              This must only be defined if there is native code for
  *              all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache.
- * Arguments:   - poly *p: pointer to in/output polynomial
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
  *
  **************************************************/
-static INLINE void poly_permute_bitrev_to_custom(poly *);
+static INLINE void poly_permute_bitrev_to_custom(int16_t p[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
 
 #if defined(MLKEM_USE_NATIVE_INTT)
@@ -117,7 +116,7 @@ static INLINE void poly_permute_bitrev_to_custom(poly *);
  *
  * Arguments:   - uint16_t *a: pointer to in/output polynomial
  **************************************************/
-static INLINE void intt_native(poly *);
+static INLINE void intt_native(int16_t p[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_INTT */
 
 #if defined(MLKEM_USE_NATIVE_POLY_REDUCE)
@@ -126,9 +125,9 @@ static INLINE void intt_native(poly *);
  *
  * Description: Applies modular reduction to all coefficients of a polynomial.
  *
- * Arguments:   - poly *r: pointer to input/output polynomial
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to input/output polynomial
  **************************************************/
-static INLINE void poly_reduce_native(poly *);
+static INLINE void poly_reduce_native(int16_t p[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
 
 #if defined(MLKEM_USE_NATIVE_POLY_TOMONT)
@@ -138,9 +137,9 @@ static INLINE void poly_reduce_native(poly *);
  * Description: Inplace conversion of all coefficients of a polynomial
  *              from normal domain to Montgomery domain
  *
- * Arguments:   - poly *r: pointer to input/output polynomial
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to input/output polynomial
  **************************************************/
-static INLINE void poly_tomont_native(poly *);
+static INLINE void poly_tomont_native(int16_t p[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
 
 #if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE)
@@ -165,8 +164,8 @@ static INLINE void poly_tomont_native(poly *);
  *              OUTPUT
  *              - cache: pointer to multiplication cache
  **************************************************/
-static INLINE void poly_mulcache_compute_native(poly_mulcache *cache,
-                                                const poly *poly);
+static INLINE void poly_mulcache_compute_native(int16_t cache[MLKEM_N / 2],
+                                                const int16_t poly[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 
 #if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
@@ -189,8 +188,9 @@ static INLINE void poly_mulcache_compute_native(poly_mulcache *cache,
  *                   in NTT domain, and of the same order as a and b.
  **************************************************/
 static INLINE void polyvec_basemul_acc_montgomery_cached_native(
-    poly *r, const polyvec *a, const polyvec *b,
-    const polyvec_mulcache *b_cache);
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]);
 #endif
 
 #if defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
@@ -209,7 +209,7 @@ static INLINE void polyvec_basemul_acc_montgomery_cached_native(
  *                   (of MLKEM_POLYBYTES bytes)
  **************************************************/
 static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
-                                       const poly *a);
+                                       const int16_t a[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
 
 #if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
@@ -226,7 +226,7 @@ static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
  *              - a: const pointer to input byte aray
  *                   (of MLKEM_POLYBYTES bytes)
  **************************************************/
-static INLINE void poly_frombytes_native(poly *a,
+static INLINE void poly_frombytes_native(int16_t a[MLKEM_N],
                                          const uint8_t r[MLKEM_POLYBYTES]);
 #endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/native/default.h
similarity index 97%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/default.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/native/default.h
index d1e41c52e..f9fe4310a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/default.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/native/default.h
@@ -8,7 +8,7 @@
 /*
  * Default arithmetic backend
  */
-#include "sys.h"
+#include "../sys.h"
 
 #ifdef SYS_AARCH64
 /*
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.c
deleted file mode 100644
index 3651c8da9..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.c
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-
-#include <stdint.h>
-#include "arith_backend.h"
-#include "debug.h"
-#include "ntt.h"
-#include "reduce.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block)
-#define ntt_layer MLKEM_NAMESPACE(ntt_layer)
-#define invntt_layer MLKEM_NAMESPACE(invntt_layer)
-/* End of static namespacing */
-
-#if !defined(MLKEM_USE_NATIVE_NTT)
-/*
- * Computes a block CT butterflies with a fixed twiddle factor,
- * using Montgomery multiplication.
- * Parameters:
- * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
- * - root: Twiddle factor to use for the butterfly. This must be in
- *         Montgomery form and signed canonical.
- * - start: Offset to the beginning of the butterfly block
- * - len: Index difference between coefficients subject to a butterfly
- * - bound: Ghost variable describing coefficient bound: Prior to `start`,
- *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
- *          they must be bound by `bound`.
- * When this function returns, output coefficients in the index range
- * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
- * Example:
- * - start=8, len=4
- *   This would compute the following four butterflies
- *          8     --    12
- *             9    --     13
- *                10   --     14
- *                   11   --     15
- * - start=4, len=2
- *   This would compute the following two butterflies
- *          4 -- 6
- *             5 -- 7
- */
-static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
-                                unsigned start, unsigned len, int bound)
-__contract__(
-  requires(start < MLKEM_N)
-  requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
-  requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
-  requires(-HALF_Q < zeta && zeta < HALF_Q)
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(array_abs_bound(r, 0, start, bound + MLKEM_Q))
-  requires(array_abs_bound(r, start, MLKEM_N, bound))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q))
-  ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
-{
-  /* `bound` is a ghost variable only needed in the CBMC specification */
-  unsigned j;
-  ((void)bound);
-  for (j = start; j < start + len; j++)
-  __loop__(
-    invariant(start <= j && j <= start + len)
-    /*
-     * Coefficients are updated in strided pairs, so the bounds for the
-     * intermediate states alternate twice between the old and new bound
-     */
-    invariant(array_abs_bound(r, 0,           j,           bound + MLKEM_Q))
-    invariant(array_abs_bound(r, j,           start + len, bound))
-    invariant(array_abs_bound(r, start + len, j + len,     bound + MLKEM_Q))
-    invariant(array_abs_bound(r, j + len,     MLKEM_N,     bound)))
-  {
-    int16_t t;
-    t = fqmul(r[j + len], zeta);
-    r[j + len] = r[j] - t;
-    r[j] = r[j] + t;
-  }
-}
-
-/*
- *Compute one layer of forward NTT
- * Parameters:
- * - r: Pointer to base of polynomial
- * - len: Stride of butterflies in this layer.
- * - layer: Ghost variable indicating which layer is being applied.
- *          Must match `len` via `len == MLKEM_N >> layer`.
- * Note: `len` could be dropped and computed in the function, but
- *   we are following the structure of the reference NTT from the
- *   official Kyber implementation here, merely adding `layer` as
- *   a ghost variable for the specifications.
- */
-static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
-__contract__(
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
-  requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
-{
-  unsigned start, k;
-  /* `layer` is a ghost variable only needed in the CBMC specification */
-  ((void)layer);
-  /* Twiddle factors for layer n start at index 2^(layer-1) */
-  k = MLKEM_N / (2 * len);
-  for (start = 0; start < MLKEM_N; start += 2 * len)
-  __loop__(
-    invariant(start < MLKEM_N + 2 * len)
-    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
-    invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
-    invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
-  {
-    int16_t zeta = zetas[k++];
-    ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
-  }
-}
-
-/*
- * Compute full forward NTT
- * NOTE: This particular implementation satisfies a much tighter
- * bound on the output coefficients (5*q) than the contractual one (8*q),
- * but this is not needed in the calling code. Should we change the
- * base multiplication strategy to require smaller NTT output bounds,
- * the proof may need strengthening.
- */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *p)
-{
-  unsigned len, layer;
-  int16_t *r;
-  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
-  r = p->coeffs;
-
-  for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
-  __loop__(
-    invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer))
-    invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)))
-  {
-    ntt_layer(r, len, layer);
-  }
-
-  /* Check the stronger bound */
-  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
-}
-#else  /* MLKEM_USE_NATIVE_NTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *p)
-{
-  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
-  ntt_native(p);
-  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
-}
-#endif /* MLKEM_USE_NATIVE_NTT */
-
-#if !defined(MLKEM_USE_NATIVE_INTT)
-
-/* Compute one layer of inverse NTT */
-static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
-__contract__(
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
-  requires(len == (1 << (8 - layer)))
-  requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-{
-  unsigned start, k;
-  /* `layer` is a ghost variable used only in the specification */
-  ((void)layer);
-  k = MLKEM_N / len - 1;
-  for (start = 0; start < MLKEM_N; start += 2 * len)
-  __loop__(
-    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-    invariant(start <= MLKEM_N && k <= 127)
-    /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
-    invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
-  {
-    unsigned j;
-    int16_t zeta = zetas[k--];
-    for (j = start; j < start + len; j++)
-    __loop__(
-      invariant(start <= j && j <= start + len)
-      invariant(start <= MLKEM_N && k <= 127)
-      invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-    {
-      int16_t t = r[j];
-      r[j] = barrett_reduce(t + r[j + len]);
-      r[j + len] = r[j + len] - t;
-      r[j + len] = fqmul(r[j + len], zeta);
-    }
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *p)
-{
-  /*
-   * Scale input polynomial to account for Montgomery factor
-   * and NTT twist. This also brings coefficients down to
-   * absolute value < MLKEM_Q.
-   */
-  unsigned j, len, layer;
-  const int16_t f = 1441;
-  int16_t *r = p->coeffs;
-
-  for (j = 0; j < MLKEM_N; j++)
-  __loop__(
-    invariant(j <= MLKEM_N)
-    invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
-  {
-    r[j] = fqmul(r[j], f);
-  }
-
-  /* Run the invNTT layers */
-  for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
-  __loop__(
-    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
-    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-  {
-    invntt_layer(p->coeffs, len, layer);
-  }
-
-  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
-}
-#else  /* MLKEM_USE_NATIVE_INTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *p)
-{
-  intt_native(p);
-  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
-}
-#endif /* MLKEM_USE_NATIVE_INTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
-                    int16_t b_cached)
-{
-  int32_t t0, t1;
-  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
-
-  t0 = (int32_t)a[1] * b_cached;
-  t0 += (int32_t)a[0] * b[0];
-  t1 = (int32_t)a[0] * b[1];
-  t1 += (int32_t)a[1] * b[0];
-
-  /* |ti| < 2 * q * 2^15 */
-  r[0] = montgomery_reduce(t0);
-  r[1] = montgomery_reduce(t1);
-
-  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
-}
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt)
-int empty_cu_ntt;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.h
deleted file mode 100644
index 4e80d3ab3..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/ntt.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef NTT_H
-#define NTT_H
-#include "common.h"
-
-#include <stdint.h>
-#include "cbmc.h"
-#include "poly.h"
-#include "reduce.h"
-
-#define zetas MLKEM_NAMESPACE(zetas)
-extern const int16_t zetas[128];
-
-#define poly_ntt MLKEM_NAMESPACE(poly_ntt)
-/*************************************************
- * Name:        poly_ntt
- *
- * Description: Computes negacyclic number-theoretic transform (NTT) of
- *              a polynomial in place.
- *
- *              The input is assumed to be in normal order and
- *              coefficient-wise bound by MLKEM_Q in absolute value.
- *
- *              The output polynomial is in bitreversed order, and
- *              coefficient-wise bound by NTT_BOUND in absolute value.
- *
- *              (NOTE: Sometimes the input to the NTT is actually smaller,
- *               which gives better bounds.)
- *
- * Arguments:   - poly *p: pointer to in/output polynomial
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND))
-);
-
-#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont)
-/*************************************************
- * Name:        poly_invntt_tomont
- *
- * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
- *              of a polynomial in place;
- *              inputs assumed to be in bitreversed order, output in normal
- *              order
- *
- *              The input is assumed to be in bitreversed order, and can
- *              have arbitrary coefficients in int16_t.
- *
- *              The output polynomial is in normal order, and
- *              coefficient-wise bound by INVNTT_BOUND in absolute value.
- *
- * Arguments:   - uint16_t *a: pointer to in/output polynomial
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND))
-);
-
-#define basemul_cached MLKEM_NAMESPACE(basemul_cached)
-/************************************************************
- * Name: basemul_cached
- *
- * Description: Computes a representative modulo q of
- *              (a0*b0 + a1*b_cached, a0*b1 + a1*b0)/65536
- *
- *              If b_cached is b1*zeta, this represents the
- *              product of (a0 + a1*X) and (b0 + b1*X) in
- *              Fq[X]/(X^2 - zeta).
- *
- * Arguments: - r: Pointer to output polynomial
- *                   Upon return, coefficients are bound by
- *                   2*MLKEM_Q in absolute value.
- *            - a: Pointer to first input polynomial
- *                   Every coefficient must be in [0..4095]
- *            - b: Pointer to second input polynomial
- *                   Can have arbitrary int16_t coefficients
- *            - b_cached: Some precomputed value, typically derived from
- *                   b1 and a twiddle factor. Can be an arbitary int16_t.
- ************************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
-                    int16_t b_cached)
-__contract__(
-  requires(memory_no_alias(r, 2 * sizeof(int16_t)))
-  requires(memory_no_alias(a, 2 * sizeof(int16_t)))
-  requires(memory_no_alias(b, 2 * sizeof(int16_t)))
-  requires(array_bound(a, 0, 2, 0, UINT12_LIMIT))
-  assigns(memory_slice(r, 2 * sizeof(int16_t)))
-  ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))
-);
-
-#endif /* NTT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/params.h
index 57ea4c8ba..7f6c12625 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/params.h
@@ -18,6 +18,7 @@
 #define MLKEM_N 256
 #define MLKEM_Q 3329
 #define UINT12_LIMIT 4096
+#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */
 
 #define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */
 #define MLKEM_SSBYTES 32  /* size in bytes of shared key */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.c
index 7483ebf6d..e8a2e2c6e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.c
@@ -8,388 +8,246 @@
 #include <stdint.h>
 #include <string.h>
 #include "arith_backend.h"
-#include "cbd.h"
 #include "cbmc.h"
 #include "debug.h"
 #include "fips202x4.h"
-#include "ntt.h"
 #include "poly.h"
-#include "reduce.h"
+#include "sampling.h"
 #include "symmetric.h"
 #include "verify.h"
 
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
-{
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 16)))
-    {
-      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
-    }
-
-    r[i * 4] = t[0] | (t[1] << 4);
-    r[i * 4 + 1] = t[2] | (t[3] << 4);
-    r[i * 4 + 2] = t[4] | (t[5] << 4);
-    r[i * 4 + 3] = t[6] | (t[7] << 4);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
-{
-  unsigned j;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-  for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(invariant(j <= MLKEM_N / 4))
-  {
-    unsigned k;
-    uint16_t t[4];
-    for (k = 0; k < 4; k++)
-    __loop__(
-      invariant(k <= 4)
-      invariant(forall(r, 0, k, t[r] < (1u << 10))))
-    {
-      t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
-    }
-
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 10-bit in size.
-     */
-    r[5 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
-    r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
-    r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
-    r[5 * j + 4] = (t[3] >> 2);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
-  {
-    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
-    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d10(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16)
+#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic)
+#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce)
+#define fqmul MLKEM_NAMESPACE(fqmul)
+#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce)
+#define basemul_cached MLKEM_NAMESPACE(basemul_cached)
+#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q)
+#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block)
+#define ntt_layer MLKEM_NAMESPACE(ntt_layer)
+#define invntt_layer MLKEM_NAMESPACE(invntt_layer)
+/* End of static namespacing */
+
+/*************************************************
+ * Name:        cast_uint16_to_int16
+ *
+ * Description: Cast uint16 value to int16
+ *
+ * Returns:
+ *   input x in     0 .. 32767: returns value unchanged
+ *   input x in 32768 .. 65535: returns (x - 65536)
+ **************************************************/
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "conversion"
+#endif
+ALWAYS_INLINE
+static INLINE int16_t cast_uint16_to_int16(uint16_t x)
 {
-  unsigned j;
-  for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(
-    invariant(j <= MLKEM_N / 4)
-    invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
-  {
-    unsigned k;
-    uint16_t t[4];
-    uint8_t const *base = &a[5 * j];
-
-    t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6));
-    t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4));
-    t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2));
-
-    for (k = 0; k < 4; k++)
-    __loop__(
-      invariant(k <= 4)
-      invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
-    }
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  /*
+   * PORTABILITY: This relies on uint16_t -> int16_t
+   * being implemented as the inverse of int16_t -> uint16_t,
+   * which is implementation-defined (C99 6.3.1.3 (3))
+   * CBMC (correctly) fails to prove this conversion is OK,
+   * so we have to suppress that check here
+   */
+  return (int16_t)x;
 }
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
-          || MLKEM_K == 3) */
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
 
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
+/*************************************************
+ * Name:        montgomery_reduce_generic
+ *
+ * Description: Generic Montgomery reduction; given a 32-bit integer a, computes
+ *              16-bit integer congruent to a * R^-1 mod q, where R=2^16
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q, with absolute value
+ *                <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2
+ *
+ **************************************************/
+ALWAYS_INLINE
+static INLINE int16_t montgomery_reduce_generic(int32_t a)
 {
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */
+  const uint32_t QINV = 62209; /* q^-1 mod 2^16 */
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 32)))
-    {
-      t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
-    }
+  /*  Compute a*q^{-1} mod 2^16 in unsigned representatives */
+  const uint16_t a_reduced = a & UINT16_MAX;
+  const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
 
-    /*
-     * Explicitly truncate to avoid warning about
-     * implicit truncation in CBMC, and use array indexing into
-     * r rather than pointer-arithmetic to simplify verification
-     */
-    r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
-    r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
-    r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
-    r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
-    r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
-  }
-}
+  /* Lift to signed canonical representative mod 2^16. */
+  const int16_t t = cast_uint16_to_int16(a_inverted);
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
-{
-  unsigned j;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  int32_t r = a - ((int32_t)t * MLKEM_Q);
+  /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */
 
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(invariant(j <= MLKEM_N / 8))
-  {
-    unsigned k;
-    uint16_t t[8];
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(k <= 8)
-      invariant(forall(r, 0, k, t[r] < (1u << 11))))
-    {
-      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
-    }
+  /*
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  r = r >> 16;
+  /* Bounds: |r >> 16| <= ceil(|r| / 2^16)
+   *                   <= ceil(|a| / 2^16 + MLKEM_Q / 2)
+   *                   <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
+   *
+   * (Note that |a >> 16| = ceil(|a| / 2^16) for negative a)
+   */
 
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 11-bit in size.
-     */
-    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
-    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
-    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
-    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
-    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
-    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
-    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
-    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
-    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
-    r[11 * j + 10] = (t[7] >> 3);
-  }
+  return (int16_t)r;
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+/*************************************************
+ * Name:        montgomery_reduce
+ *
+ * Description: Montgomery reduction
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *                  Must be smaller than 2 * 2^12 * 2^15 in absolute value.
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q,
+ *              smaller than 2 * q in absolute value.
+ **************************************************/
+static INLINE int16_t montgomery_reduce(int32_t a)
+__contract__(
+  requires(a > -(2 * UINT12_LIMIT * 32768))
+  requires(a <  (2 * UINT12_LIMIT * 32768))
+  ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
+)
 {
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
-  {
-    unsigned j;
-    uint8_t t[8];
-    const unsigned offset = i * 5;
-    /*
-     * Explicitly truncate to avoid warning about
-     * implicit truncation in CBMC and unwind loop for ease
-     * of proof.
-     */
-
-    /*
-     * Decompress 5 8-bit bytes (so 40 bits) into
-     * 8 5-bit values stored in t[]
-     */
-    t[0] = 0x1F & (a[offset + 0] >> 0);
-    t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3));
-    t[2] = 0x1F & (a[offset + 1] >> 2);
-    t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1));
-    t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4));
-    t[5] = 0x1F & (a[offset + 3] >> 1);
-    t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2));
-    t[7] = 0x1F & (a[offset + 4] >> 3);
-
-    /* and copy to the correct slice in r[] */
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(j <= 8 && i <= MLKEM_N / 8)
-      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
-    }
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  int16_t res;
+  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
+
+  res = montgomery_reduce_generic(a);
+  /* Bounds:
+   * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2
+   *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
+   *        < 2 * MLKEM_Q */
+
+  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
+  return res;
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d11(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+#if !defined(MLKEM_USE_NATIVE_POLY_TOMONT) ||           \
+    !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
+    !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        fqmul
+ *
+ * Description: Montgomery multiplication modulo q=3329
+ *
+ * Arguments:   - int16_t a: first factor
+ *                  Can be any int16_t.
+ *              - int16_t b: second factor.
+ *                  Must be signed canonical (abs value <(q+1)/2)
+ *
+ * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and
+ * smaller than q in absolute value.
+ *
+ **************************************************/
+static INLINE int16_t fqmul(int16_t a, int16_t b)
+__contract__(
+  requires(b > -HALF_Q)
+  requires(b < HALF_Q)
+  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
+)
 {
-  unsigned j;
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(
-    invariant(j <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
-  {
-    unsigned k;
-    uint16_t t[8];
-    uint8_t const *base = &a[11 * j];
-    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
-    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
-                    ((uint16_t)base[4] << 10));
-    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
-    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
-    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
-                    ((uint16_t)base[8] << 9));
-    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
-    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
-
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(k <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
-    }
-  }
+  int16_t res;
+  debug_assert_abs_bound(&b, 1, HALF_Q);
+
+  res = montgomery_reduce((int32_t)a * (int32_t)b);
+  /* Bounds:
+   * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
+   *        < MLKEM_Q
+   */
 
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  debug_assert_abs_bound(&res, 1, MLKEM_Q);
+  return res;
 }
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */
-
-#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+#endif /* !defined(MLKEM_USE_NATIVE_POLY_TOMONT) ||           \
+          !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
+          !defined(MLKEM_USE_NATIVE_NTT) ||                   \
+          !defined(MLKEM_USE_NATIVE_INTT) */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || !defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        barrett_reduce
+ *
+ * Description: Barrett reduction; given a 16-bit integer a, computes
+ *              centered representative congruent to a mod q in
+ *              {-(q-1)/2,...,(q-1)/2}
+ *
+ * Arguments:   - int16_t a: input integer to be reduced
+ *
+ * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
+ **************************************************/
+static INLINE int16_t barrett_reduce(int16_t a)
+__contract__(
+  ensures(return_value > -HALF_Q && return_value < HALF_Q)
+)
 {
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(invariant(i <= MLKEM_N / 2))
-  {
-    const uint16_t t0 = a->coeffs[2 * i];
-    const uint16_t t1 = a->coeffs[2 * i + 1];
-    /*
-     * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
-     * significant data, so these can be packed into 24 bits or exactly
-     * 3 bytes, as follows.
-     */
-
-    /* Least significant bits 0 - 7 of t0. */
-    r[3 * i + 0] = t0 & 0xFF;
-
-    /*
-     * Most significant bits 8 - 11 of t0 become the least significant
-     * nibble of the second byte. The least significant 4 bits
-     * of t1 become the upper nibble of the second byte.
-     */
-    r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+  /*
+   * To divide by MLKEM_Q using Barrett multiplication, the "magic number"
+   * multiplier is round_to_nearest(2**26/MLKEM_Q)
+   */
+  const int BPOWER = 26;
+  const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q;
 
-    /* Bits 4 - 11 of t1 become the third byte. */
-    r[3 * i + 2] = t1 >> 4;
-  }
-}
-#else  /* MLKEM_USE_NATIVE_POLY_TOBYTES */
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
-{
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-  poly_tobytes_native(r, a);
-}
-#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+  /*
+   * Compute round_to_nearest(a/MLKEM_Q) using the multiplier
+   * above and shift by BPOWER places.
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER;
 
-#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
-  {
-    const uint8_t t0 = a[3 * i + 0];
-    const uint8_t t1 = a[3 * i + 1];
-    const uint8_t t2 = a[3 * i + 2];
-    r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
-    r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
-  }
+  /*
+   * t is in -10 .. +10, so we need 32-bit math to
+   * evaluate t * MLKEM_Q and the subsequent subtraction
+   */
+  int16_t res = (int16_t)(a - t * MLKEM_Q);
 
-  /* Note that the coefficients are not canonical */
-  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
-}
-#else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-{
-  poly_frombytes_native(r, a);
+  debug_assert_abs_bound(&res, 1, HALF_Q);
+  return res;
 }
-#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+#endif /* !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || \
+          !defined(MLKEM_USE_NATIVE_INTT) */
+
+static void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
+                           int16_t b_cached)
+__contract__(
+  requires(memory_no_alias(r, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(a, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(b, 2 * sizeof(int16_t)))
+  requires(array_bound(a, 0, 2, 0, UINT12_LIMIT))
+  assigns(memory_slice(r, 2 * sizeof(int16_t)))
+  ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q)))
 {
-  unsigned i;
-#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
-#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
-#endif
+  int32_t t0, t1;
+  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
-  {
-    unsigned j;
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <  MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
-    {
-      /* Prevent the compiler from recognizing this as a bit selection */
-      uint8_t mask = value_barrier_u8(1u << j);
-      r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
-    }
-  }
-  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
-}
+  t0 = (int32_t)a[1] * b_cached;
+  t0 += (int32_t)a[0] * b[0];
+  t1 = (int32_t)a[0] * b[1];
+  t1 += (int32_t)a[1] * b[0];
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
-{
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  /* |ti| < 2 * UINT12_LIMIT * 2^15 */
+  r[0] = montgomery_reduce(t0);
+  r[1] = montgomery_reduce(t1);
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    msg[i] = 0;
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8))
-    {
-      uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
-      msg[i] |= t << j;
-    }
-  }
+  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -434,12 +292,46 @@ void poly_tomont(poly *r)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomont(poly *r)
 {
-  poly_tomont_native(r);
+  poly_tomont_native(r->coeffs);
   debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
 
 #if !defined(MLKEM_USE_NATIVE_POLY_REDUCE)
+/************************************************************
+ * Name: scalar_signed_to_unsigned_q
+ *
+ * Description: converts signed polynomial coefficient
+ *              from signed (-3328 .. 3328) form to
+ *              unsigned form (0 .. 3328).
+ *
+ * Note: Cryptographic constant time implementation
+ *
+ * Examples:       0 -> 0
+ *                 1 -> 1
+ *              3328 -> 3328
+ *                -1 -> 3328
+ *                -2 -> 3327
+ *             -3328 -> 1
+ *
+ * Arguments: c: signed coefficient to be converted
+ ************************************************************/
+static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
+__contract__(
+  requires(c > -MLKEM_Q && c < MLKEM_Q)
+  ensures(return_value >= 0 && return_value < MLKEM_Q)
+  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
+{
+  debug_assert_abs_bound(&c, 1, MLKEM_Q);
+
+  /* Add Q if c is negative, but in constant time */
+  c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
+
+  /* and therefore cast to uint16_t is safe. */
+  debug_assert_bound(&c, 1, 0, MLKEM_Q);
+  return (uint16_t)c;
+}
+
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
@@ -461,7 +353,7 @@ void poly_reduce(poly *r)
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
-  poly_reduce_native(r);
+  poly_reduce_native(r->coeffs);
   debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
@@ -520,13 +412,232 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 MLKEM_NATIVE_INTERNAL_API
 void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
-  poly_mulcache_compute_native(x, a);
+  poly_mulcache_compute_native(x->coeffs, a->coeffs);
   /* Omitting bounds assertion since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
 }
 #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 
+#if !defined(MLKEM_USE_NATIVE_NTT)
+/*
+ * Computes a block of CT butterflies with a fixed twiddle factor,
+ * using Montgomery multiplication.
+ * Parameters:
+ * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
+ * - zeta: Twiddle factor to use for the butterfly. This must be in
+ *         Montgomery form and signed canonical.
+ * - start: Offset to the beginning of the butterfly block
+ * - len: Index difference between coefficients subject to a butterfly
+ * - bound: Ghost variable describing coefficient bound: Prior to `start`,
+ *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
+ *          they must be bound by `bound`.
+ * When this function returns, output coefficients in the index range
+ * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
+ * Example:
+ * - start=8, len=4
+ *   This would compute the following four butterflies
+ *          8     --    12
+ *             9    --     13
+ *                10   --     14
+ *                   11   --     15
+ * - start=4, len=2
+ *   This would compute the following two butterflies
+ *          4 -- 6
+ *             5 -- 7
+ */
+static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
+                                unsigned start, unsigned len, int bound)
+__contract__(
+  requires(start < MLKEM_N)
+  requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
+  requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
+  requires(-HALF_Q < zeta && zeta < HALF_Q)
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(array_abs_bound(r, 0, start, bound + MLKEM_Q))
+  requires(array_abs_bound(r, start, MLKEM_N, bound))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q))
+  ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
+{
+  /* `bound` is a ghost variable only needed in the CBMC specification */
+  unsigned j;
+  ((void)bound);
+  for (j = start; j < start + len; j++)
+  __loop__(
+    invariant(start <= j && j <= start + len)
+    /*
+     * Coefficients are updated in strided pairs, so the bounds for the
+     * intermediate states alternate twice between the old and new bound
+     */
+    invariant(array_abs_bound(r, 0,           j,           bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j,           start + len, bound))
+    invariant(array_abs_bound(r, start + len, j + len,     bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j + len,     MLKEM_N,     bound)))
+  {
+    int16_t t;
+    t = fqmul(r[j + len], zeta);
+    r[j + len] = r[j] - t;
+    r[j] = r[j] + t;
+  }
+}
+
+/*
+ * Compute one layer of forward NTT
+ * Parameters:
+ * - r: Pointer to base of polynomial
+ * - len: Stride of butterflies in this layer.
+ * - layer: Ghost variable indicating which layer is being applied.
+ *          Must match `len` via `len == MLKEM_N >> layer`.
+ * Note: `len` could be dropped and computed in the function, but
+ *   we are following the structure of the reference NTT from the
+ *   official Kyber implementation here, merely adding `layer` as
+ *   a ghost variable for the specifications.
+ */
+static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
+  requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
+{
+  unsigned start, k;
+  /* `layer` is a ghost variable only needed in the CBMC specification */
+  ((void)layer);
+  /* Twiddle factors for this layer start at index 2^(layer-1) */
+  k = MLKEM_N / (2 * len);
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(start < MLKEM_N + 2 * len)
+    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
+    invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
+    invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
+  {
+    int16_t zeta = zetas[k++];
+    ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
+  }
+}
+
+/*
+ * Compute full forward NTT
+ * NOTE: This particular implementation satisfies a much tighter
+ * bound on the output coefficients (5*q) than the contractual one (8*q),
+ * but this is not needed in the calling code. Should we change the
+ * base multiplication strategy to require smaller NTT output bounds,
+ * the proof may need strengthening.
+ */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *p)
+{
+  unsigned len, layer;
+  int16_t *r;
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+  r = p->coeffs;
+
+  for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
+  __loop__(
+    invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer))
+    invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)))
+  {
+    ntt_layer(r, len, layer);
+  }
+
+  /* Check the contractual output bound (NTT_BOUND) */
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
+}
+#else  /* MLKEM_USE_NATIVE_NTT */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *p)
+{
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+  ntt_native(p->coeffs);
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
+}
+#endif /* MLKEM_USE_NATIVE_NTT */
+
+#if !defined(MLKEM_USE_NATIVE_INTT)
+
+/* Compute one layer of inverse NTT */
+static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
+  requires(len == (1 << (8 - layer)))
+  requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+{
+  unsigned start, k;
+  /* `layer` is a ghost variable used only in the specification */
+  ((void)layer);
+  k = MLKEM_N / len - 1;
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+    invariant(start <= MLKEM_N && k <= 127)
+    /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
+    invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
+  {
+    unsigned j;
+    int16_t zeta = zetas[k--];
+    for (j = start; j < start + len; j++)
+    __loop__(
+      invariant(start <= j && j <= start + len)
+      invariant(start <= MLKEM_N && k <= 127)
+      invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+    {
+      int16_t t = r[j];
+      r[j] = barrett_reduce(t + r[j + len]);
+      r[j + len] = r[j + len] - t;
+      r[j + len] = fqmul(r[j + len], zeta);
+    }
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *p)
+{
+  /*
+   * Scale input polynomial to account for Montgomery factor
+   * and NTT twist. This also brings coefficients down to
+   * absolute value < MLKEM_Q.
+   */
+  unsigned j, len, layer;
+  const int16_t f = 1441;
+  int16_t *r = p->coeffs;
+
+  for (j = 0; j < MLKEM_N; j++)
+  __loop__(
+    invariant(j <= MLKEM_N)
+    invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
+  {
+    r[j] = fqmul(r[j], f);
+  }
+
+  /* Run the invNTT layers */
+  for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
+  __loop__(
+    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
+    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+  {
+    invntt_layer(p->coeffs, len, layer);
+  }
+
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
+}
+#else  /* MLKEM_USE_NATIVE_INTT */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *p)
+{
+  intt_native(p->coeffs);
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
+}
+#endif /* MLKEM_USE_NATIVE_INTT */
+
 #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 #define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.h
index 6a14c785d..cb0d67c1a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly.h
@@ -9,7 +9,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "reduce.h"
+#include "debug.h"
 #include "verify.h"
 
 /* Absolute exclusive upper bound for the output of the inverse NTT */
@@ -18,6 +18,9 @@
 /* Absolute exclusive upper bound for the output of the forward NTT */
 #define NTT_BOUND (8 * MLKEM_Q)
 
+#define zetas MLKEM_NAMESPACE(zetas)
+extern const int16_t zetas[128];
+
 /*
  * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
  * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
@@ -38,520 +41,6 @@ typedef struct
   int16_t coeffs[MLKEM_N >> 1];
 } poly_mulcache;
 
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1)
-#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4)
-#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5)
-#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10)
-#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11)
-#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4)
-#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5)
-#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10)
-#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11)
-#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q)
-/* End of static namespacing */
-
-/************************************************************
- * Name: scalar_compress_d1
- *
- * Description: Computes round(u * 2 / q)
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 1.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d1(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 2)
-  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
-{
-  uint32_t d0 = u << 1;
-  d0 *= 645083;
-  d0 += 1u << 30;
-  d0 >>= 31;
-  return d0;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_compress_d4
- *
- * Description: Computes round(u * 16 / q) % 16
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 4.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d4(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 16)
-  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
-{
-  uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */
-  return (d0 + (1u << 27)) >> 28;      /* round(d0/2^28) */
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d4
- *
- * Description: Computes round(u * q / 16)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 4.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d4(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 16)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 8) / 16; }
-
-/************************************************************
- * Name: scalar_compress_d5
- *
- * Description: Computes round(u * 32 / q) % 32
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 5.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d5(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 32)
-  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
-{
-  uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */
-  return (d0 + (1u << 26)) >> 27;      /* round(d0/2^27) */
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d5
- *
- * Description: Computes round(u * q / 32)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 5.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 32
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d5(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 32)
-  ensures(return_value <= MLKEM_Q - 1)
-) { return ((u * MLKEM_Q) + 16) / 32; }
-
-/************************************************************
- * Name: scalar_compress_d10
- *
- * Description: Computes round(u * 2**10 / q) % 2**10
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d10(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < (1u << 10))
-  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
-{
-  uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^32 / MLKEM_Q) */
-  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
-  return (d0 & 0x3FF);
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d10
- *
- * Description: Computes round(u * q / 1024)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d10(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 1024)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 512) / 1024; }
-
-/************************************************************
- * Name: scalar_compress_d11
- *
- * Description: Computes round(u * 2**11 / q) % 2**11
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 11.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d11(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < (1u << 11))
-  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
-{
-  uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */
-  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
-  return (d0 & 0x7FF);
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d11
- *
- * Description: Computes round(u * q / 1024)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d11(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 2048)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 1024) / 2048; }
-
-/************************************************************
- * Name: scalar_signed_to_unsigned_q
- *
- * Description: converts signed polynomial coefficient
- *              from signed (-3328 .. 3328) form to
- *              unsigned form (0 .. 3328).
- *
- * Note: Cryptographic constant time implementation
- *
- * Examples:       0 -> 0
- *                 1 -> 1
- *              3328 -> 3328
- *                -1 -> 3328
- *                -2 -> 3327
- *             -3328 -> 1
- *
- * Arguments: c: signed coefficient to be converted
- ************************************************************/
-static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
-__contract__(
-  requires(c > -MLKEM_Q && c < MLKEM_Q)
-  ensures(return_value >= 0 && return_value < MLKEM_Q)
-  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
-{
-  debug_assert_abs_bound(&c, 1, MLKEM_Q);
-
-  /* Add Q if c is negative, but in constant time */
-  c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
-
-  /* and therefore cast to uint16_t is safe. */
-  debug_assert_bound(&c, 1, 0, MLKEM_Q);
-  return (uint16_t)c;
-}
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
-    (MLKEM_K == 2 || MLKEM_K == 3)
-#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
-/*************************************************
- * Name:        poly_compress_d4
- *
- * Description: Compression (4 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
-
-#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
-/*************************************************
- * Name:        poly_compress_d10
- *
- * Description: Compression (10 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
-
-#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
-/*************************************************
- * Name:        poly_decompress_d4
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
-
-#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
-/*************************************************
- * Name:        poly_decompress_d10
- *
- * Description: De-serialization and subsequent decompression (10 bits) of a
- *              polynomial; approximate inverse of poly_compress_d10
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d10(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
-          || MLKEM_K == 3) */
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
-#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
-/*************************************************
- * Name:        poly_compress_d5
- *
- * Description: Compression (5 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
-
-#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
-/*************************************************
- * Name:        poly_compress_d11
- *
- * Description: Compression (11 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
-
-#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
-/*************************************************
- * Name:        poly_decompress_d5
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
-
-#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
-/*************************************************
- * Name:        poly_decompress_d11
- *
- * Description: De-serialization and subsequent decompression (11 bits) of a
- *              polynomial; approximate inverse of poly_compress_d11
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d11(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
-        */
-
-#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
-/*************************************************
- * Name:        poly_tobytes
- *
- * Description: Serialization of a polynomial.
- *              Signed coefficients are converted to
- *              unsigned form before serialization.
- *
- * Arguments:   INPUT:
- *              - a: const pointer to input polynomial,
- *                with each coefficient in the range [0,1,..,Q-1]
- *              OUTPUT
- *              - r: pointer to output byte array
- *                   (of MLKEM_POLYBYTES bytes)
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYBYTES))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r))
-);
-
-
-#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes)
-/*************************************************
- * Name:        poly_frombytes
- *
- * Description: De-serialization of a polynomial.
- *
- * Arguments:   INPUT
- *              - a: pointer to input byte array
- *                   (of MLKEM_POLYBYTES bytes)
- *              OUTPUT
- *              - r: pointer to output polynomial, with
- *                   each coefficient unsigned and in the range
- *                   0 .. 4095
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))
-);
-
-
-#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg)
-/*************************************************
- * Name:        poly_frommsg
- *
- * Description: Convert 32-byte message to polynomial
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *msg: pointer to input message
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
-__contract__(
-  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
-
-#define poly_tomsg MLKEM_NAMESPACE(poly_tomsg)
-/*************************************************
- * Name:        poly_tomsg
- *
- * Description: Convert polynomial to 32-byte message
- *
- * Arguments:   - uint8_t *msg: pointer to output message
- *              - const poly *r: pointer to input polynomial
- *                Coefficients must be unsigned canonical
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r)
-__contract__(
-  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(msg))
-);
-
 #define poly_basemul_montgomery_cached \
   MLKEM_NAMESPACE(poly_basemul_montgomery_cached)
 /*************************************************
@@ -715,4 +204,56 @@ __contract__(
   assigns(object_whole(r))
 );
 
+#define poly_ntt MLKEM_NAMESPACE(poly_ntt)
+/*************************************************
+ * Name:        poly_ntt
+ *
+ * Description: Computes negacyclic number-theoretic transform (NTT) of
+ *              a polynomial in place.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bound by MLKEM_Q in absolute value.
+ *
+ *              The output polynomial is in bitreversed order, and
+ *              coefficient-wise bound by NTT_BOUND in absolute value.
+ *
+ *              (NOTE: Sometimes the input to the NTT is actually smaller,
+ *               which gives better bounds.)
+ *
+ * Arguments:   - poly *r: pointer to in/output polynomial
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND))
+);
+
+#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont)
+/*************************************************
+ * Name:        poly_invntt_tomont
+ *
+ * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+ *              of a polynomial in place;
+ *              inputs assumed to be in bitreversed order, output in normal
+ *              order
+ *
+ *              The input is assumed to be in bitreversed order, and can
+ *              have arbitrary coefficients in int16_t.
+ *
+ *              The output polynomial is in normal order, and
+ *              coefficient-wise bound by INVNTT_BOUND in absolute value.
+ *
+ * Arguments:   - poly *r: pointer to in/output polynomial
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND))
+);
+
 #endif /* POLY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly_k.c
similarity index 97%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/polyvec.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly_k.c
index 50ea1c34a..c2d330ea9 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/polyvec.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly_k.c
@@ -2,13 +2,12 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#include "polyvec.h"
+#include "poly_k.h"
 #include <stdint.h>
 #include <string.h>
 #include "arith_backend.h"
-#include "cbd.h"
-#include "ntt.h"
-#include "poly.h"
+#include "compress.h"
+#include "sampling.h"
 #include "symmetric.h"
 
 #include "debug.h"
@@ -131,7 +130,9 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
   /* Omitting bounds assertion for cache since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
-  polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache);
+  polyvec_basemul_acc_montgomery_cached_native(r->coeffs, (const int16_t *)a,
+                                               (const int16_t *)b,
+                                               (const int16_t *)b_cache);
 }
 #endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly_k.h
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/polyvec.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly_k.h
index 8be8579e0..0aea95912 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/polyvec.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/poly_k.h
@@ -2,11 +2,12 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#ifndef POLYVEC_H
-#define POLYVEC_H
+#ifndef POLY_K_H
+#define POLY_K_H
 
 #include <stdint.h>
 #include "common.h"
+#include "compress.h"
 #include "poly.h"
 
 #define polyvec MLKEM_NAMESPACE_K(polyvec)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/reduce.h
deleted file mode 100644
index b432a4201..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/reduce.h
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef REDUCE_H
-#define REDUCE_H
-
-#include <stdint.h>
-#include "cbmc.h"
-#include "common.h"
-#include "debug.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16)
-#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic)
-#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce)
-#define fqmul MLKEM_NAMESPACE(fqmul)
-#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce)
-/* End of static namespacing */
-
-#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */
-
-/*************************************************
- * Name:        cast_uint16_to_int16
- *
- * Description: Cast uint16 value to int16
- *
- * Returns:
- *   input x in     0 .. 32767: returns value unchanged
- *   input x in 32768 .. 65535: returns (x - 65536)
- **************************************************/
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
-ALWAYS_INLINE
-static INLINE int16_t cast_uint16_to_int16(uint16_t x)
-{
-  /*
-   * PORTABILITY: This relies on uint16_t -> int16_t
-   * being implemented as the inverse of int16_t -> uint16_t,
-   * which is implementation-defined (C99 6.3.1.3 (3))
-   * CBMC (correctly) fails to prove this conversion is OK,
-   * so we have to suppress that check here
-   */
-  return (int16_t)x;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/*************************************************
- * Name:        montgomery_reduce_generic
- *
- * Description: Generic Montgomery reduction; given a 32-bit integer a, computes
- *              16-bit integer congruent to a * R^-1 mod q, where R=2^16
- *
- * Arguments:   - int32_t a: input integer to be reduced
- *
- * Returns:     integer congruent to a * R^-1 modulo q, with absolute value
- *                <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2
- *
- **************************************************/
-ALWAYS_INLINE
-static INLINE int16_t montgomery_reduce_generic(int32_t a)
-{
-  /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */
-  const uint32_t QINV = 62209; /* q^-1 mod 2^16 */
-
-  /*  Compute a*q^{-1} mod 2^16 in unsigned representatives */
-  const uint16_t a_reduced = a & UINT16_MAX;
-  const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
-
-  /* Lift to signed canonical representative mod 2^16. */
-  const int16_t t = cast_uint16_to_int16(a_inverted);
-
-  int32_t r = a - ((int32_t)t * MLKEM_Q);
-  /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */
-
-  /*
-   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
-   * implementation-defined for negative left argument. Here,
-   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
-   */
-  r = r >> 16;
-  /* Bounds: |r >> 16| <= ceil(|r| / 2^16)
-   *                   <= ceil(|a| / 2^16 + MLKEM_Q / 2)
-   *                   <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
-   *
-   * (Note that |a >> n| = ceil(|a| / 2^16) for negative a)
-   */
-
-  return (int16_t)r;
-}
-
-/*************************************************
- * Name:        montgomery_reduce
- *
- * Description: Montgomery reduction
- *
- * Arguments:   - int32_t a: input integer to be reduced
- *                  Must be smaller than 2 * 2^12 * 2^15 in absolute value.
- *
- * Returns:     integer congruent to a * R^-1 modulo q,
- *              smaller than 2 * q in absolute value.
- **************************************************/
-static INLINE int16_t montgomery_reduce(int32_t a)
-__contract__(
-  requires(a > -(2 * UINT12_LIMIT * 32768))
-  requires(a <  (2 * UINT12_LIMIT * 32768))
-  ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
-)
-{
-  int16_t res;
-  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
-
-  res = montgomery_reduce_generic(a);
-  /* Bounds:
-   * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2
-   *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
-   *        < 2 * MLKEM_Q */
-
-  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
-  return res;
-}
-
-/*************************************************
- * Name:        fqmul
- *
- * Description: Montgomery multiplication modulo q=3329
- *
- * Arguments:   - int16_t a: first factor
- *                  Can be any int16_t.
- *              - int16_t b: second factor.
- *                  Must be signed canonical (abs value <(q+1)/2)
- *
- * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and
- * smaller than q in absolute value.
- *
- **************************************************/
-static INLINE int16_t fqmul(int16_t a, int16_t b)
-__contract__(
-  requires(b > -HALF_Q)
-  requires(b < HALF_Q)
-  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
-)
-{
-  int16_t res;
-  debug_assert_abs_bound(&b, 1, HALF_Q);
-
-  res = montgomery_reduce((int32_t)a * (int32_t)b);
-  /* Bounds:
-   * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
-   *        < MLKEM_Q
-   */
-
-  debug_assert_abs_bound(&res, 1, MLKEM_Q);
-  return res;
-}
-
-/*************************************************
- * Name:        barrett_reduce
- *
- * Description: Barrett reduction; given a 16-bit integer a, computes
- *              centered representative congruent to a mod q in
- *              {-(q-1)/2,...,(q-1)/2}
- *
- * Arguments:   - int16_t a: input integer to be reduced
- *
- * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
- **************************************************/
-static INLINE int16_t barrett_reduce(int16_t a)
-__contract__(
-  ensures(return_value > -HALF_Q && return_value < HALF_Q)
-)
-{
-  /*
-   * To divide by MLKEM_Q using Barrett multiplication, the "magic number"
-   * multiplier is round_to_nearest(2**26/MLKEM_Q)
-   */
-  const int BPOWER = 26;
-  const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q;
-
-  /*
-   * Compute round_to_nearest(a/MLKEM_Q) using the multiplier
-   * above and shift by BPOWER places.
-   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
-   * implementation-defined for negative left argument. Here,
-   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
-   */
-  const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER;
-
-  /*
-   * t is in -10 .. +10, so we need 32-bit math to
-   * evaluate t * MLKEM_Q and the subsequent subtraction
-   */
-  int16_t res = (int16_t)(a - t * MLKEM_Q);
-
-  debug_assert_abs_bound(&res, 1, HALF_Q);
-  return res;
-}
-
-#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/sampling.c
similarity index 73%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/rej_uniform.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/sampling.c
index cbbe4407f..98cbdcb74 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/rej_uniform.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/sampling.c
@@ -9,7 +9,7 @@
 #include "debug.h"
 #include "fips202.h"
 #include "fips202x4.h"
-#include "rej_uniform.h"
+#include "sampling.h"
 #include "symmetric.h"
 
 /* Static namespacing
@@ -18,6 +18,8 @@
  * within a single compilation unit. */
 #define rej_uniform MLKEM_NAMESPACE(rej_uniform)
 #define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
+#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
+#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
 /* End of static namespacing */
 
 static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
@@ -233,9 +235,113 @@ void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
   xof_release(&state);
 }
 
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
+#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
+/* End of static namespacing */
+
+/*************************************************
+ * Name:        load32_littleendian
+ *
+ * Description: load 4 bytes into a 32-bit integer
+ *              in little-endian order
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x
+ **************************************************/
+static uint32_t load32_littleendian(const uint8_t x[4])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  r |= (uint32_t)x[3] << 24;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
+  {
+    unsigned j;
+    uint32_t t = load32_littleendian(buf + 4 * i);
+    uint32_t d = t & 0x55555555;
+    d += (t >> 1) & 0x55555555;
+
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
+    {
+      const int16_t a = (d >> (4 * j + 0)) & 0x3;
+      const int16_t b = (d >> (4 * j + 2)) & 0x3;
+      r->coeffs[8 * i + j] = a - b;
+    }
+  }
+}
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+/*************************************************
+ * Name:        load24_littleendian
+ *
+ * Description: load 3 bytes into a 32-bit integer
+ *              in little-endian order.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
+ **************************************************/
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 4; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 4)
+    invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
+  {
+    unsigned j;
+    const uint32_t t = load24_littleendian(buf + 3 * i);
+    uint32_t d = t & 0x00249249;
+    d += (t >> 1) & 0x00249249;
+    d += (t >> 2) & 0x00249249;
+
+    for (j = 0; j < 4; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 4 && j <= 4)
+      invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
+    {
+      const int16_t a = (d >> (6 * j + 0)) & 0x7;
+      const int16_t b = (d >> (6 * j + 3)) & 0x7;
+      r->coeffs[4 * i + j] = a - b;
+    }
+  }
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
+          3 */
+
 #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
-#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform)
-int empty_cu_rej_uniform;
+#define empty_cu_sampling MLKEM_NAMESPACE_K(empty_cu_sampling)
+int empty_cu_sampling;
 
 #endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/sampling.h
similarity index 63%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/rej_uniform.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/sampling.h
index 801287259..cc524e0fc 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/rej_uniform.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/sampling.h
@@ -2,8 +2,8 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#ifndef REJ_UNIFORM_H
-#define REJ_UNIFORM_H
+#ifndef SAMPLING_H
+#define SAMPLING_H
 
 #include <stdint.h>
 #include <stdlib.h>
@@ -11,6 +11,37 @@
 #include "common.h"
 #include "poly.h"
 
+#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
+/*************************************************
+ * Name:        poly_cbd2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
+/*************************************************
+ * Name:        poly_cbd3
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=3.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED || MLKEM_ETA1 == 3 */
+
 #define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
 /*************************************************
  * Name:        poly_rej_uniform_x4
@@ -60,4 +91,4 @@ __contract__(
   assigns(memory_slice(entry, sizeof(poly)))
   ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
 
-#endif /* REJ_UNIFORM_H */
+#endif /* SAMPLING_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/zetas.c
index 4ef887c62..987f0dce4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/zetas.c
@@ -10,7 +10,7 @@
 
 #include "common.h"
 #if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-#include "ntt.h"
+#include "poly.h"
 
 /*
  * Table of zeta values used in the reference NTT and inverse NTT.
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/arith_backend.h
index 0543b1bd1..ade31cda1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/arith_backend.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/arith_backend.h
@@ -17,7 +17,7 @@
  * Keep this _after_ the inclusion of the backend; otherwise,
  * the sanity checks won't have an effect. */
 #if defined(MLKEM_NATIVE_CHECK_APIS)
-#include "api.h"
+#include "native/api.h"
 #endif
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbd.c
deleted file mode 100644
index 1e6b7c5d1..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbd.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
-
-#include <stdint.h>
-#include "cbd.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
-#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
-/* End of static namespacing */
-
-/*************************************************
- * Name:        load32_littleendian
- *
- * Description: load 4 bytes into a 32-bit integer
- *              in little-endian order
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x
- **************************************************/
-static uint32_t load32_littleendian(const uint8_t x[4])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  r |= (uint32_t)x[3] << 24;
-  return r;
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
-  {
-    unsigned j;
-    uint32_t t = load32_littleendian(buf + 4 * i);
-    uint32_t d = t & 0x55555555;
-    d += (t >> 1) & 0x55555555;
-
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
-    {
-      const int16_t a = (d >> (4 * j + 0)) & 0x3;
-      const int16_t b = (d >> (4 * j + 2)) & 0x3;
-      r->coeffs[8 * i + j] = a - b;
-    }
-  }
-}
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
-/*************************************************
- * Name:        load24_littleendian
- *
- * Description: load 3 bytes into a 32-bit integer
- *              in little-endian order.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
- **************************************************/
-static uint32_t load24_littleendian(const uint8_t x[3])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  return r;
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 4; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 4)
-    invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
-  {
-    unsigned j;
-    const uint32_t t = load24_littleendian(buf + 3 * i);
-    uint32_t d = t & 0x00249249;
-    d += (t >> 1) & 0x00249249;
-    d += (t >> 2) & 0x00249249;
-
-    for (j = 0; j < 4; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 4 && j <= 4)
-      invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
-    {
-      const int16_t a = (d >> (6 * j + 0)) & 0x7;
-      const int16_t b = (d >> (6 * j + 3)) & 0x7;
-      r->coeffs[4 * i + j] = a - b;
-    }
-  }
-}
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
-          3 */
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd)
-int empty_cu_cbd;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbd.h
deleted file mode 100644
index 54c1f5b90..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/cbd.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef CBD_H
-#define CBD_H
-
-#include <stdint.h>
-#include "common.h"
-#include "poly.h"
-
-#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
-/*************************************************
- * Name:        poly_cbd2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
-#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
-/*************************************************
- * Name:        poly_cbd3
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=3.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */
-
-#endif /* CBD_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/common.h
index 4f326333e..62ed53ab1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/common.h
@@ -15,12 +15,19 @@
 #include "sys.h"
 
 /* Include backend metadata */
-#if defined(MLKEM_USE_NATIVE)
-#if defined(MLKEM_NATIVE_ARITH_BACKEND)
-#include MLKEM_NATIVE_ARITH_BACKEND
+#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH)
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_FILE)
+#include MLKEM_NATIVE_ARITH_BACKEND_FILE
+#else
+#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_ARITH is set, but MLKEM_NATIVE_ARITH_BACKEND_FILE is not.
+#endif
 #endif
-#if defined(MLKEM_NATIVE_FIPS202_BACKEND)
-#include MLKEM_NATIVE_FIPS202_BACKEND
+
+#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202)
+#if defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE)
+#include MLKEM_NATIVE_FIPS202_BACKEND_FILE
+#else
+#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, but MLKEM_NATIVE_FIPS202_BACKEND_FILE is not.
 #endif
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/compress.c
new file mode 100644
index 000000000..a03fe0ac4
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/compress.c
@@ -0,0 +1,395 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
+#include <stdint.h>
+#include <string.h>
+#include "arith_backend.h"
+#include "cbmc.h"
+#include "compress.h"
+#include "debug.h"
+#include "verify.h"
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 16)))
+    {
+      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
+    }
+
+    r[i * 4] = t[0] | (t[1] << 4);
+    r[i * 4 + 1] = t[2] | (t[3] << 4);
+    r[i * 4 + 2] = t[4] | (t[5] << 4);
+    r[i * 4 + 3] = t[6] | (t[7] << 4);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(invariant(j <= MLKEM_N / 4))
+  {
+    unsigned k;
+    uint16_t t[4];
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k <= 4)
+      invariant(forall(r, 0, k, t[r] < (1u << 10))))
+    {
+      t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 10-bit in size.
+     */
+    r[5 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
+    r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
+    r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
+    r[5 * j + 4] = (t[3] >> 2);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
+  {
+    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
+    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 4)
+    invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[4];
+    uint8_t const *base = &a[5 * j];
+
+    t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6));
+    t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4));
+    t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2));
+
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k <= 4)
+      invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 32)))
+    {
+      t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
+    }
+
+    /*
+     * Explicitly truncate to avoid warning about
+     * implicit truncation in CBMC, and use array indexing into
+     * r rather than pointer-arithmetic to simplify verification
+     */
+    r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
+    r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
+    r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
+    r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
+    r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(invariant(j <= MLKEM_N / 8))
+  {
+    unsigned k;
+    uint16_t t[8];
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+    {
+      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 11-bit in size.
+     */
+    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
+    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
+    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
+    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
+    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
+    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
+    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
+    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
+    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
+    r[11 * j + 10] = (t[7] >> 3);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
+  {
+    unsigned j;
+    uint8_t t[8];
+    const unsigned offset = i * 5;
+    /*
+     * Explicitly truncate to avoid warning about
+     * implicit truncation in CBMC and unwind loop for ease
+     * of proof.
+     */
+
+    /*
+     * Decompress 5 8-bit bytes (so 40 bits) into
+     * 8 5-bit values stored in t[]
+     */
+    t[0] = 0x1F & (a[offset + 0] >> 0);
+    t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3));
+    t[2] = 0x1F & (a[offset + 1] >> 2);
+    t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1));
+    t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4));
+    t[5] = 0x1F & (a[offset + 3] >> 1);
+    t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2));
+    t[7] = 0x1F & (a[offset + 4] >> 3);
+
+    /* and copy to the correct slice in r[] */
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(j <= 8 && i <= MLKEM_N / 8)
+      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[8];
+    uint8_t const *base = &a[11 * j];
+    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
+    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
+                    ((uint16_t)base[4] << 10));
+    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
+    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
+    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
+                    ((uint16_t)base[8] << 9));
+    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
+    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
+
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(invariant(i <= MLKEM_N / 2))
+  {
+    const uint16_t t0 = a->coeffs[2 * i];
+    const uint16_t t1 = a->coeffs[2 * i + 1];
+    /*
+     * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
+     * significant data, so these can be packed into 24 bits or exactly
+     * 3 bytes, as follows.
+     */
+
+    /* Least significant bits 0 - 7 of t0. */
+    r[3 * i + 0] = t0 & 0xFF;
+
+    /*
+     * Most significant bits 8 - 11 of t0 become the least significant
+     * nibble of the second byte. The least significant 4 bits
+     * of t1 become the upper nibble of the second byte.
+     */
+    r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+
+    /* Bits 4 - 11 of t1 become the third byte. */
+    r[3 * i + 2] = t1 >> 4;
+  }
+}
+#else  /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  poly_tobytes_native(r, a->coeffs);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
+  {
+    const uint8_t t0 = a[3 * i + 0];
+    const uint8_t t1 = a[3 * i + 1];
+    const uint8_t t2 = a[3 * i + 2];
+    r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
+    r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
+  }
+
+  /* Note that the coefficients are not canonical */
+  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
+}
+#else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+  poly_frombytes_native(r->coeffs, a);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+{
+  unsigned i;
+#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
+#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
+#endif
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
+  {
+    unsigned j;
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <  MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
+    {
+      /* Prevent the compiler from recognizing this as a bit selection */
+      uint8_t mask = value_barrier_u8(1u << j);
+      r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
+    }
+  }
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    msg[i] = 0;
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8))
+    {
+      uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
+      msg[i] |= t << j;
+    }
+  }
+}
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_compress MLKEM_NAMESPACE_K(empty_cu_compress)
+int empty_cu_compress;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/compress.h
new file mode 100644
index 000000000..409dbe519
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/compress.h
@@ -0,0 +1,495 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef COMPRESS_H
+#define COMPRESS_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "cbmc.h"
+#include "common.h"
+#include "debug.h"
+#include "poly.h"
+#include "verify.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1)
+#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4)
+#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5)
+#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10)
+#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11)
+#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4)
+#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5)
+#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10)
+#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11)
+/* End of static namespacing */
+
+/************************************************************
+ * Name: scalar_compress_d1
+ *
+ * Description: Computes round(u * 2 / q) % 2
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 1.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * For large u, the rounding addition following the multiplication exceeds
+ * UINT32_MAX and wraps around. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d1(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 2)
+  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
+{
+  uint32_t d0 = u << 1;
+  d0 *= 645083;
+  d0 += 1u << 30;
+  d0 >>= 31;
+  return d0;
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_compress_d4
+ *
+ * Description: Computes round(u * 16 / q) % 16
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * For large u, the rounding addition following the multiplication exceeds
+ * UINT32_MAX and wraps around. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d4(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 16)
+  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
+{
+  uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */
+  return (d0 + (1u << 27)) >> 28;      /* round(d0/2^28) */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d4
+ *
+ * Description: Computes round(u * q / 16)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 16
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d4(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 16)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 8) / 16; }
+
+/************************************************************
+ * Name: scalar_compress_d5
+ *
+ * Description: Computes round(u * 32 / q) % 32
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * For large u, the rounding addition following the multiplication exceeds
+ * UINT32_MAX and wraps around. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d5(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 32)
+  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
+{
+  uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */
+  return (d0 + (1u << 26)) >> 27;      /* round(d0/2^27) */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d5
+ *
+ * Description: Computes round(u * q / 32)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 32
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d5(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 32)
+  ensures(return_value <= MLKEM_Q - 1)
+) { return ((u * MLKEM_Q) + 16) / 32; }
+
+/************************************************************
+ * Name: scalar_compress_d10
+ *
+ * Description: Computes round(u * 2**10 / q) % 2**10
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The product in this routine exceeds UINT32_MAX for large values of u.
+ * It is computed in 64 bits; the final mask reduces the result modulo 2^10.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d10(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 10))
+  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
+{
+  uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^33 / MLKEM_Q) */
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
+  return (d0 & 0x3FF);
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d10
+ *
+ * Description: Computes round(u * q / 1024)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 1024
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d10(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 1024)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 512) / 1024; }
+
+/************************************************************
+ * Name: scalar_compress_d11
+ *
+ * Description: Computes round(u * 2**11 / q) % 2**11
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The product in this routine exceeds UINT32_MAX for large values of u.
+ * It is computed in 64 bits; the final mask reduces the result modulo 2^11.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d11(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 11))
+  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
+{
+  uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
+  return (d0 & 0x7FF);
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d11
+ *
+ * Description: Computes round(u * q / 2048)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 2048
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d11(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 2048)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 1024) / 2048; }
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
+    (MLKEM_K == 2 || MLKEM_K == 3)
+#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
+/*************************************************
+ * Name:        poly_compress_d4
+ *
+ * Description: Compression (4 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
+
+#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
+/*************************************************
+ * Name:        poly_compress_d10
+ *
+ * Description: Compression (10 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
+
+#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
+/*************************************************
+ * Name:        poly_decompress_d4
+ *
+ * Description: De-serialization and subsequent decompression (4 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d4
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
+
+#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
+/*************************************************
+ * Name:        poly_decompress_d10
+ *
+ * Description: De-serialization and subsequent decompression (10 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d10
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
+/*************************************************
+ * Name:        poly_compress_d5
+ *
+ * Description: Compression (5 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
+
+#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
+/*************************************************
+ * Name:        poly_compress_d11
+ *
+ * Description: Compression (11 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
+
+#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
+/*************************************************
+ * Name:        poly_decompress_d5
+ *
+ * Description: De-serialization and subsequent decompression (5 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d5
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+
+#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
+/*************************************************
+ * Name:        poly_decompress_d11
+ *
+ * Description: De-serialization and subsequent decompression (11 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d11
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
+        */
+
+#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
+/*************************************************
+ * Name:        poly_tobytes
+ *
+ * Description: Serialization of a polynomial.
+ *              Coefficients are required to be in unsigned
+ *              canonical form (see the contract below).
+ *
+ * Arguments:   INPUT:
+ *              - a: const pointer to input polynomial,
+ *                with each coefficient in the range [0,1,..,Q-1]
+ *              OUTPUT
+ *              - r: pointer to output byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYBYTES))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r))
+);
+
+
+#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes)
+/*************************************************
+ * Name:        poly_frombytes
+ *
+ * Description: De-serialization of a polynomial.
+ *
+ * Arguments:   INPUT
+ *              - a: pointer to input byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ *              OUTPUT
+ *              - r: pointer to output polynomial, with
+ *                   each coefficient unsigned and in the range
+ *                   0 .. 4095
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))
+);
+
+
+#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg)
+/*************************************************
+ * Name:        poly_frommsg
+ *
+ * Description: Convert 32-byte message to polynomial
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *msg: pointer to input message
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+);
+
+#define poly_tomsg MLKEM_NAMESPACE(poly_tomsg)
+/*************************************************
+ * Name:        poly_tomsg
+ *
+ * Description: Convert polynomial to 32-byte message
+ *
+ * Arguments:   - uint8_t *msg: pointer to output message
+ *              - const poly *r: pointer to input polynomial
+ *                Coefficients must be unsigned canonical
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r)
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(msg))
+);
+
+#endif /* COMPRESS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/config.h
index fa89370ce..e975ede95 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/config.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/config.h
@@ -122,46 +122,87 @@
 /* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 /******************************************************************************
- * Name:        MLKEM_USE_NATIVE
+ * Name:        MLKEM_USE_NATIVE_BACKEND_ARITH
  *
- * Description: Determines whether a native backend should
- *              be used, if available.
+ * Description: Determines whether a native arithmetic backend should be used.
+ *
+ *              The arithmetic backend covers performance critical functions
+ *              such as the number-theoretic transform (NTT).
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the arithmetic backend to be used is
+ *              determined by MLKEM_NATIVE_ARITH_BACKEND_FILE: If the latter is
+ *              unset, the default backend for the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
  *
  *              This can also be set using CFLAGS.
  *
  *****************************************************************************/
-#if !defined(MLKEM_USE_NATIVE)
-/* #define MLKEM_USE_NATIVE */
+#if !defined(MLKEM_USE_NATIVE_BACKEND_ARITH)
+/* #define MLKEM_USE_NATIVE_BACKEND_ARITH */
 #endif
 
 /******************************************************************************
- * Name:        MLKEM_NATIVE_ARITH_BACKEND
+ * Name:        MLKEM_NATIVE_ARITH_BACKEND_FILE
  *
  * Description: The arithmetic backend to use.
  *
- *              This must be the filename of an arithmetic backend.
- *              See the existing backends for examples.
+ *              If MLKEM_USE_NATIVE_BACKEND_ARITH is unset, this option
+ *              is ignored.
+ *
+ *              If MLKEM_USE_NATIVE_BACKEND_ARITH is set, this option must
+ *              either be undefined or the filename of an arithmetic backend.
+ *              If unset, the default backend will be used.
  *
  *              This can be set using CFLAGS.
  *
  *****************************************************************************/
-#if defined(MLKEM_USE_NATIVE) && !defined(MLKEM_NATIVE_ARITH_BACKEND)
-#define MLKEM_NATIVE_ARITH_BACKEND "default.h"
-#endif /* MLKEM_NATIVE_ARITH_BACKEND */
+#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) && \
+    !defined(MLKEM_NATIVE_ARITH_BACKEND_FILE)
+#define MLKEM_NATIVE_ARITH_BACKEND_FILE "native/default.h"
+#endif
 
 /******************************************************************************
- * Name:        MLKEM_NATIVE_FIPS202_BACKEND
+ * Name:        MLKEM_USE_NATIVE_BACKEND_FIPS202
+ *
+ * Description: Determines whether a native FIPS202 backend should be used.
+ *
+ *              The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is
+ *              the performance bottleneck of SHA3 and SHAKE.
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the FIPS202 backend to be used is
+ *              determined by MLKEM_NATIVE_FIPS202_BACKEND_FILE: If the latter is
+ *              unset, the default backend for the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if !defined(MLKEM_USE_NATIVE_BACKEND_FIPS202)
+/* #define MLKEM_USE_NATIVE_BACKEND_FIPS202 */
+#endif
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_FIPS202_BACKEND_FILE
  *
  * Description: The FIPS-202 backend to use.
  *
- *              This must be the filename of an FIPS-202 backend.
+ *              If MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, this option must
+ *              either be undefined or the filename of a FIPS202 backend.
+ *              If unset, the default backend will be used.
  *
  *              This can be set using CFLAGS.
  *
  *****************************************************************************/
-#if defined(MLKEM_USE_NATIVE_FIPS202) && !defined(MLKEM_NATIVE_FIPS202_BACKEND)
-#define MLKEM_NATIVE_FIPS202_BACKEND "native/default.h"
-#endif /* MLKEM_NATIVE_FIPS202_BACKEND */
+#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) && \
+    !defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE)
+#define MLKEM_NATIVE_FIPS202_BACKEND_FILE "fips202/native/default.h"
+#endif
 
 /*************************  Config internals  ********************************/
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.c
index 0cfcc3e9e..318d0fc77 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.c
@@ -9,11 +9,10 @@
 #include "fips202.h"
 #include "fips202x4.h"
 #include "indcpa.h"
-#include "ntt.h"
 #include "poly.h"
-#include "polyvec.h"
+#include "poly_k.h"
 #include "randombytes.h"
-#include "rej_uniform.h"
+#include "sampling.h"
 #include "symmetric.h"
 
 #include "arith_backend.h"
@@ -149,14 +148,14 @@ static void unpack_ciphertext(polyvec *b, poly *v,
 #define poly_permute_bitrev_to_custom \
   MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom)
 
-static INLINE void poly_permute_bitrev_to_custom(poly *data)
+static INLINE void poly_permute_bitrev_to_custom(int16_t data[MLKEM_N])
 __contract__(
   /* We don't specify that this should be a permutation, but only
    * that it does not change the bound established at the end of gen_matrix. */
-  requires(memory_no_alias(data, sizeof(poly)))
-  requires(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N))
+  requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))
   assigns(memory_slice(data, sizeof(poly)))
-  ensures(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
+  ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
 #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
 
 /* Not static for benchmarking */
@@ -245,7 +244,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
   {
     for (j = 0; j < MLKEM_K; j++)
     {
-      poly_permute_bitrev_to_custom(&a[i].vec[j]);
+      poly_permute_bitrev_to_custom(a[i].vec[j].coeffs);
     }
   }
 }
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.h
index 2c4fda3c4..b4d5985bf 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/indcpa.h
@@ -8,7 +8,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "polyvec.h"
+#include "poly_k.h"
 
 #define gen_matrix MLKEM_NAMESPACE_K(gen_matrix)
 /*************************************************
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/api.h
similarity index 90%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/api.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/api.h
index 792ecb8a4..0704f9dcd 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/api.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/api.h
@@ -23,8 +23,7 @@
 #define MLKEM_NATIVE_ARITH_NATIVE_API_H
 
 #include <stdint.h>
-#include "poly.h"
-#include "polyvec.h"
+#include "../common.h"
 
 /*
  * This is the C<->native interface allowing for the drop-in of
@@ -65,9 +64,9 @@
  *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
  *              for more information.
  *
- * Arguments:   - poly *p: pointer to in/output polynomial
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
  **************************************************/
-static INLINE void ntt_native(poly *);
+static INLINE void ntt_native(int16_t p[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_NTT */
 
 #if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
@@ -96,10 +95,10 @@ and to/from bytes conversions."
  *
  *              This must only be defined if there is native code for
  *              all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache.
- * Arguments:   - poly *p: pointer to in/output polynomial
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
  *
  **************************************************/
-static INLINE void poly_permute_bitrev_to_custom(poly *);
+static INLINE void poly_permute_bitrev_to_custom(int16_t p[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
 
 #if defined(MLKEM_USE_NATIVE_INTT)
@@ -117,7 +116,7 @@ static INLINE void poly_permute_bitrev_to_custom(poly *);
  *
  * Arguments:   - uint16_t *a: pointer to in/output polynomial
  **************************************************/
-static INLINE void intt_native(poly *);
+static INLINE void intt_native(int16_t p[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_INTT */
 
 #if defined(MLKEM_USE_NATIVE_POLY_REDUCE)
@@ -126,9 +125,9 @@ static INLINE void intt_native(poly *);
  *
  * Description: Applies modular reduction to all coefficients of a polynomial.
  *
- * Arguments:   - poly *r: pointer to input/output polynomial
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to input/output polynomial
  **************************************************/
-static INLINE void poly_reduce_native(poly *);
+static INLINE void poly_reduce_native(int16_t p[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
 
 #if defined(MLKEM_USE_NATIVE_POLY_TOMONT)
@@ -138,9 +137,9 @@ static INLINE void poly_reduce_native(poly *);
  * Description: Inplace conversion of all coefficients of a polynomial
  *              from normal domain to Montgomery domain
  *
- * Arguments:   - poly *r: pointer to input/output polynomial
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to input/output polynomial
  **************************************************/
-static INLINE void poly_tomont_native(poly *);
+static INLINE void poly_tomont_native(int16_t p[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
 
 #if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE)
@@ -165,8 +164,8 @@ static INLINE void poly_tomont_native(poly *);
  *              OUTPUT
  *              - cache: pointer to multiplication cache
  **************************************************/
-static INLINE void poly_mulcache_compute_native(poly_mulcache *cache,
-                                                const poly *poly);
+static INLINE void poly_mulcache_compute_native(int16_t cache[MLKEM_N / 2],
+                                                const int16_t poly[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 
 #if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
@@ -189,8 +188,9 @@ static INLINE void poly_mulcache_compute_native(poly_mulcache *cache,
  *                   in NTT domain, and of the same order as a and b.
  **************************************************/
 static INLINE void polyvec_basemul_acc_montgomery_cached_native(
-    poly *r, const polyvec *a, const polyvec *b,
-    const polyvec_mulcache *b_cache);
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]);
 #endif
 
 #if defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
@@ -209,7 +209,7 @@ static INLINE void polyvec_basemul_acc_montgomery_cached_native(
  *                   (of MLKEM_POLYBYTES bytes)
  **************************************************/
 static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
-                                       const poly *a);
+                                       const int16_t a[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
 
 #if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
@@ -226,7 +226,7 @@ static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
  *              - a: const pointer to input byte aray
  *                   (of MLKEM_POLYBYTES bytes)
  **************************************************/
-static INLINE void poly_frombytes_native(poly *a,
+static INLINE void poly_frombytes_native(int16_t a[MLKEM_N],
                                          const uint8_t r[MLKEM_POLYBYTES]);
 #endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/default.h
similarity index 97%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/default.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/default.h
index d1e41c52e..f9fe4310a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/default.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/default.h
@@ -8,7 +8,7 @@
 /*
  * Default arithmetic backend
  */
-#include "sys.h"
+#include "../sys.h"
 
 #ifdef SYS_AARCH64
 /*
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/README.md b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/README.md
similarity index 100%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/README.md
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/README.md
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/default.h
similarity index 90%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/default.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/default.h
index 592e8996d..73f53dc13 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/default.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/default.h
@@ -19,6 +19,6 @@
 /* Filename of the C backend implementation.
  * This is not inlined here because this header is included in assembly
  * files as well. */
-#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "x86_64/src/default_impl.h"
+#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/x86_64/src/default_impl.h"
 
 #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/align.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/align.h
similarity index 100%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/align.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/align.h
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/arith_native_x86_64.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/arith_native_x86_64.h
similarity index 91%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/arith_native_x86_64.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/arith_native_x86_64.h
index 25e00a930..acde977ad 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/arith_native_x86_64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/arith_native_x86_64.h
@@ -5,11 +5,10 @@
 #ifndef MLKEM_X86_64_NATIVE_H
 #define MLKEM_X86_64_NATIVE_H
 
-#include "common.h"
+#include "../../../common.h"
 
 #include <immintrin.h>
 #include <stdint.h>
-#include "polyvec.h"
 #include "consts.h"
 
 #define REJ_UNIFORM_AVX_NBLOCKS 3 /* See MLKEM_GEN_MATRIX_NBLOCKS */
@@ -44,8 +43,9 @@ void basemul_avx2(__m256i *r, const __m256i *a, const __m256i *b,
 #define polyvec_basemul_acc_montgomery_cached_avx2 \
   MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_avx2)
 void polyvec_basemul_acc_montgomery_cached_avx2(
-    poly *r, const polyvec *a, const polyvec *b,
-    const polyvec_mulcache *b_cache);
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]);
 
 #define ntttobytes_avx2 MLKEM_NAMESPACE(ntttobytes_avx2)
 void ntttobytes_avx2(uint8_t *r, const __m256i *a, const __m256i *qdata);
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.S
index b97840e70..5fdc3d0a0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.S
@@ -6,7 +6,7 @@
 // Implementation from Kyber reference repository
 // https://github.com/pq-crystals/kyber/blob/main/avx2
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
 #include "consts.h"
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.c
similarity index 51%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.c
index 5f9ae99c8..970938306 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.c
@@ -3,46 +3,46 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
-#include "poly.h"
-#include "polyvec.h"
-
 #include "arith_native_x86_64.h"
 #include "consts.h"
 
-static void poly_basemul_montgomery_avx2(poly *r, const poly *a, const poly *b)
+static void poly_basemul_montgomery_avx2(int16_t r[MLKEM_N],
+                                         const int16_t a[MLKEM_N],
+                                         const int16_t b[MLKEM_N])
 {
-  basemul_avx2((__m256i *)r->coeffs, (const __m256i *)a->coeffs,
-               (const __m256i *)b->coeffs, qdata.vec);
+  basemul_avx2((__m256i *)r, (const __m256i *)a, (const __m256i *)b, qdata.vec);
 }
 
 /*
  * Implementation from Kyber reference repository
  * https://github.com/pq-crystals/kyber/blob/main/avx2
  */
-static void poly_add_avx2(poly *r, const poly *a, const poly *b)
+static void poly_add_avx2(int16_t r[MLKEM_N], const int16_t a[MLKEM_N],
+                          const int16_t b[MLKEM_N])
 {
   unsigned i;
   __m256i f0, f1;
 
   for (i = 0; i < MLKEM_N; i += 16)
   {
-    f0 = _mm256_load_si256((const __m256i *)&a->coeffs[i]);
-    f1 = _mm256_load_si256((const __m256i *)&b->coeffs[i]);
+    f0 = _mm256_load_si256((const __m256i *)&a[i]);
+    f1 = _mm256_load_si256((const __m256i *)&b[i]);
     f0 = _mm256_add_epi16(f0, f1);
-    _mm256_store_si256((__m256i *)&r->coeffs[i], f0);
+    _mm256_store_si256((__m256i *)&r[i], f0);
   }
 }
 
-void polyvec_basemul_acc_montgomery_cached_avx2(poly *r, const polyvec *a,
-                                                const polyvec *b,
-                                                const polyvec_mulcache *b_cache)
+void polyvec_basemul_acc_montgomery_cached_avx2(
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)])
 {
   unsigned i;
-  poly t;
+  int16_t t[MLKEM_N] ALIGN;
 
   /* TODO: Use mulcache for AVX2. So far, it is unused. */
   ((void)b_cache);
@@ -50,11 +50,11 @@ void polyvec_basemul_acc_montgomery_cached_avx2(poly *r, const polyvec *a,
   /* Coefficient-wise bound of each basemul is 2q.
    * Since we are accumulating at most 4 times, the
    * overall bound is 8q < INT16_MAX. */
-  poly_basemul_montgomery_avx2(r, &a->vec[0], &b->vec[0]);
+  poly_basemul_montgomery_avx2(r, &a[0], &b[0]);
   for (i = 1; i < MLKEM_K; i++)
   {
-    poly_basemul_montgomery_avx2(&t, &a->vec[i], &b->vec[i]);
-    poly_add_avx2(r, r, &t);
+    poly_basemul_montgomery_avx2(t, &a[i * MLKEM_N], &b[i * MLKEM_N]);
+    poly_add_avx2(r, r, t);
   }
 }
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/consts.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/consts.c
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/consts.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/consts.c
index 86a0835ef..568752ae8 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/consts.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/consts.c
@@ -8,7 +8,7 @@
  * https://github.com/pq-crystals/kyber/blob/main/avx2/consts.c
  */
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/consts.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/consts.h
similarity index 97%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/consts.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/consts.h
index 00c415952..e2846b609 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/consts.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/consts.h
@@ -11,7 +11,7 @@
 #ifndef CONSTS_H
 #define CONSTS_H
 
-#include "common.h"
+#include "../../../common.h"
 
 #define AVX2_BACKEND_DATA_OFFSET_16XQ 0
 #define AVX2_BACKEND_DATA_OFFSET_16XQINV 16
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/default_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/default_impl.h
similarity index 62%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/default_impl.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/default_impl.h
index 029111c17..3683361e2 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/default_impl.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/default_impl.h
@@ -12,8 +12,7 @@
 
 #include <string.h>
 
-#include "poly.h"
-#include "polyvec.h"
+#include "../../../params.h"
 #include "arith_native_x86_64.h"
 
 #define MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
@@ -28,9 +27,9 @@
 #define MLKEM_USE_NATIVE_POLY_TOBYTES
 #define MLKEM_USE_NATIVE_POLY_FROMBYTES
 
-static INLINE void poly_permute_bitrev_to_custom(poly *data)
+static INLINE void poly_permute_bitrev_to_custom(int16_t data[MLKEM_N])
 {
-  nttunpack_avx2((__m256i *)(data->coeffs), qdata.vec);
+  nttunpack_avx2((__m256i *)(data), qdata.vec);
 }
 
 static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
@@ -45,27 +44,28 @@ static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
   return (int)rej_uniform_avx2(r, buf);
 }
 
-static INLINE void ntt_native(poly *data)
+static INLINE void ntt_native(int16_t data[MLKEM_N])
 {
   ntt_avx2((__m256i *)data, qdata.vec);
 }
 
-static INLINE void intt_native(poly *data)
+static INLINE void intt_native(int16_t data[MLKEM_N])
 {
   invntt_avx2((__m256i *)data, qdata.vec);
 }
 
-static INLINE void poly_reduce_native(poly *data)
+static INLINE void poly_reduce_native(int16_t data[MLKEM_N])
 {
-  reduce_avx2((__m256i *)data->coeffs, qdata.vec);
+  reduce_avx2((__m256i *)data, qdata.vec);
 }
 
-static INLINE void poly_tomont_native(poly *data)
+static INLINE void poly_tomont_native(int16_t data[MLKEM_N])
 {
-  tomont_avx2((__m256i *)data->coeffs, qdata.vec);
+  tomont_avx2((__m256i *)data, qdata.vec);
 }
 
-static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const poly *y)
+static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2],
+                                                const int16_t y[MLKEM_N])
 {
   /* AVX2 backend does not use mulcache */
   ((void)y);
@@ -73,22 +73,23 @@ static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const poly *y)
 }
 
 static INLINE void polyvec_basemul_acc_montgomery_cached_native(
-    poly *r, const polyvec *a, const polyvec *b,
-    const polyvec_mulcache *b_cache)
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)])
 {
   polyvec_basemul_acc_montgomery_cached_avx2(r, a, b, b_cache);
 }
 
 static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
-                                       const poly *a)
+                                       const int16_t a[MLKEM_N])
 {
-  ntttobytes_avx2(r, (const __m256i *)a->coeffs, qdata.vec);
+  ntttobytes_avx2(r, (const __m256i *)a, qdata.vec);
 }
 
-static INLINE void poly_frombytes_native(poly *r,
+static INLINE void poly_frombytes_native(int16_t r[MLKEM_N],
                                          const uint8_t a[MLKEM_POLYBYTES])
 {
-  nttfrombytes_avx2((__m256i *)r->coeffs, a, qdata.vec);
+  nttfrombytes_avx2((__m256i *)r, a, qdata.vec);
 }
 
 #endif /* MLKEM_NATIVE_ARITH_PROFILE_IMPL_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/fq.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/fq.S
similarity index 98%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/fq.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/fq.S
index 134bd4f71..3f013a5fa 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/fq.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/fq.S
@@ -11,7 +11,7 @@
 //   in [0,1,...,q-1] rather than [0,1,...,q], matching the
 //   semantics of poly_reduce().
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 #include "consts.h"
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/fq.inc b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/fq.inc
similarity index 100%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/fq.inc
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/fq.inc
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/intt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/intt.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/intt.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/intt.S
index 6b1d78ef2..7b1f22624 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/intt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/intt.S
@@ -9,7 +9,7 @@
  * Changes to placement of modular reductions have
  * been made to simplify reasoning of non-overflow */
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/ntt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntt.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/ntt.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntt.S
index e8bf7894b..5d928b4cc 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/ntt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntt.S
@@ -6,7 +6,7 @@
 // Implementation from Kyber reference repository
 // https://github.com/pq-crystals/kyber/blob/main/avx2
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
 #include "consts.h"
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/rej_uniform_avx2.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_avx2.c
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/rej_uniform_avx2.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_avx2.c
index 54037a0df..adf2d338b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/rej_uniform_avx2.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_avx2.c
@@ -8,7 +8,7 @@
  * https://github.com/pq-crystals/kyber/blob/main/avx2
  */
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/rej_uniform_table.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_table.c
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/rej_uniform_table.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_table.c
index 9bbc47146..e95fd9e79 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/rej_uniform_table.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_table.c
@@ -8,7 +8,7 @@
  *          Do not modify it directly.
  */
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/shuffle.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/shuffle.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/shuffle.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/shuffle.S
index 5e708748a..9bcd04896 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/shuffle.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/shuffle.S
@@ -6,7 +6,7 @@
 // Implementation from Kyber reference repository
 // https://github.com/pq-crystals/kyber/blob/main/avx2
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/shuffle.inc b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/shuffle.inc
similarity index 100%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/shuffle.inc
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/shuffle.inc
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/x86_64_zetas.i b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/x86_64_zetas.i
similarity index 100%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/x86_64_zetas.i
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/x86_64_zetas.i
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/ntt.c
deleted file mode 100644
index 3651c8da9..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/ntt.c
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-
-#include <stdint.h>
-#include "arith_backend.h"
-#include "debug.h"
-#include "ntt.h"
-#include "reduce.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block)
-#define ntt_layer MLKEM_NAMESPACE(ntt_layer)
-#define invntt_layer MLKEM_NAMESPACE(invntt_layer)
-/* End of static namespacing */
-
-#if !defined(MLKEM_USE_NATIVE_NTT)
-/*
- * Computes a block CT butterflies with a fixed twiddle factor,
- * using Montgomery multiplication.
- * Parameters:
- * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
- * - root: Twiddle factor to use for the butterfly. This must be in
- *         Montgomery form and signed canonical.
- * - start: Offset to the beginning of the butterfly block
- * - len: Index difference between coefficients subject to a butterfly
- * - bound: Ghost variable describing coefficient bound: Prior to `start`,
- *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
- *          they must be bound by `bound`.
- * When this function returns, output coefficients in the index range
- * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
- * Example:
- * - start=8, len=4
- *   This would compute the following four butterflies
- *          8     --    12
- *             9    --     13
- *                10   --     14
- *                   11   --     15
- * - start=4, len=2
- *   This would compute the following two butterflies
- *          4 -- 6
- *             5 -- 7
- */
-static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
-                                unsigned start, unsigned len, int bound)
-__contract__(
-  requires(start < MLKEM_N)
-  requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
-  requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
-  requires(-HALF_Q < zeta && zeta < HALF_Q)
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(array_abs_bound(r, 0, start, bound + MLKEM_Q))
-  requires(array_abs_bound(r, start, MLKEM_N, bound))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q))
-  ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
-{
-  /* `bound` is a ghost variable only needed in the CBMC specification */
-  unsigned j;
-  ((void)bound);
-  for (j = start; j < start + len; j++)
-  __loop__(
-    invariant(start <= j && j <= start + len)
-    /*
-     * Coefficients are updated in strided pairs, so the bounds for the
-     * intermediate states alternate twice between the old and new bound
-     */
-    invariant(array_abs_bound(r, 0,           j,           bound + MLKEM_Q))
-    invariant(array_abs_bound(r, j,           start + len, bound))
-    invariant(array_abs_bound(r, start + len, j + len,     bound + MLKEM_Q))
-    invariant(array_abs_bound(r, j + len,     MLKEM_N,     bound)))
-  {
-    int16_t t;
-    t = fqmul(r[j + len], zeta);
-    r[j + len] = r[j] - t;
-    r[j] = r[j] + t;
-  }
-}
-
-/*
- *Compute one layer of forward NTT
- * Parameters:
- * - r: Pointer to base of polynomial
- * - len: Stride of butterflies in this layer.
- * - layer: Ghost variable indicating which layer is being applied.
- *          Must match `len` via `len == MLKEM_N >> layer`.
- * Note: `len` could be dropped and computed in the function, but
- *   we are following the structure of the reference NTT from the
- *   official Kyber implementation here, merely adding `layer` as
- *   a ghost variable for the specifications.
- */
-static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
-__contract__(
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
-  requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
-{
-  unsigned start, k;
-  /* `layer` is a ghost variable only needed in the CBMC specification */
-  ((void)layer);
-  /* Twiddle factors for layer n start at index 2^(layer-1) */
-  k = MLKEM_N / (2 * len);
-  for (start = 0; start < MLKEM_N; start += 2 * len)
-  __loop__(
-    invariant(start < MLKEM_N + 2 * len)
-    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
-    invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
-    invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
-  {
-    int16_t zeta = zetas[k++];
-    ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
-  }
-}
-
-/*
- * Compute full forward NTT
- * NOTE: This particular implementation satisfies a much tighter
- * bound on the output coefficients (5*q) than the contractual one (8*q),
- * but this is not needed in the calling code. Should we change the
- * base multiplication strategy to require smaller NTT output bounds,
- * the proof may need strengthening.
- */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *p)
-{
-  unsigned len, layer;
-  int16_t *r;
-  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
-  r = p->coeffs;
-
-  for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
-  __loop__(
-    invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer))
-    invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)))
-  {
-    ntt_layer(r, len, layer);
-  }
-
-  /* Check the stronger bound */
-  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
-}
-#else  /* MLKEM_USE_NATIVE_NTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *p)
-{
-  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
-  ntt_native(p);
-  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
-}
-#endif /* MLKEM_USE_NATIVE_NTT */
-
-#if !defined(MLKEM_USE_NATIVE_INTT)
-
-/* Compute one layer of inverse NTT */
-static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
-__contract__(
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
-  requires(len == (1 << (8 - layer)))
-  requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-{
-  unsigned start, k;
-  /* `layer` is a ghost variable used only in the specification */
-  ((void)layer);
-  k = MLKEM_N / len - 1;
-  for (start = 0; start < MLKEM_N; start += 2 * len)
-  __loop__(
-    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-    invariant(start <= MLKEM_N && k <= 127)
-    /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
-    invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
-  {
-    unsigned j;
-    int16_t zeta = zetas[k--];
-    for (j = start; j < start + len; j++)
-    __loop__(
-      invariant(start <= j && j <= start + len)
-      invariant(start <= MLKEM_N && k <= 127)
-      invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-    {
-      int16_t t = r[j];
-      r[j] = barrett_reduce(t + r[j + len]);
-      r[j + len] = r[j + len] - t;
-      r[j + len] = fqmul(r[j + len], zeta);
-    }
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *p)
-{
-  /*
-   * Scale input polynomial to account for Montgomery factor
-   * and NTT twist. This also brings coefficients down to
-   * absolute value < MLKEM_Q.
-   */
-  unsigned j, len, layer;
-  const int16_t f = 1441;
-  int16_t *r = p->coeffs;
-
-  for (j = 0; j < MLKEM_N; j++)
-  __loop__(
-    invariant(j <= MLKEM_N)
-    invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
-  {
-    r[j] = fqmul(r[j], f);
-  }
-
-  /* Run the invNTT layers */
-  for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
-  __loop__(
-    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
-    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-  {
-    invntt_layer(p->coeffs, len, layer);
-  }
-
-  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
-}
-#else  /* MLKEM_USE_NATIVE_INTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *p)
-{
-  intt_native(p);
-  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
-}
-#endif /* MLKEM_USE_NATIVE_INTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
-                    int16_t b_cached)
-{
-  int32_t t0, t1;
-  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
-
-  t0 = (int32_t)a[1] * b_cached;
-  t0 += (int32_t)a[0] * b[0];
-  t1 = (int32_t)a[0] * b[1];
-  t1 += (int32_t)a[1] * b[0];
-
-  /* |ti| < 2 * q * 2^15 */
-  r[0] = montgomery_reduce(t0);
-  r[1] = montgomery_reduce(t1);
-
-  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
-}
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt)
-int empty_cu_ntt;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/ntt.h
deleted file mode 100644
index 4e80d3ab3..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/ntt.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef NTT_H
-#define NTT_H
-#include "common.h"
-
-#include <stdint.h>
-#include "cbmc.h"
-#include "poly.h"
-#include "reduce.h"
-
-#define zetas MLKEM_NAMESPACE(zetas)
-extern const int16_t zetas[128];
-
-#define poly_ntt MLKEM_NAMESPACE(poly_ntt)
-/*************************************************
- * Name:        poly_ntt
- *
- * Description: Computes negacyclic number-theoretic transform (NTT) of
- *              a polynomial in place.
- *
- *              The input is assumed to be in normal order and
- *              coefficient-wise bound by MLKEM_Q in absolute value.
- *
- *              The output polynomial is in bitreversed order, and
- *              coefficient-wise bound by NTT_BOUND in absolute value.
- *
- *              (NOTE: Sometimes the input to the NTT is actually smaller,
- *               which gives better bounds.)
- *
- * Arguments:   - poly *p: pointer to in/output polynomial
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND))
-);
-
-#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont)
-/*************************************************
- * Name:        poly_invntt_tomont
- *
- * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
- *              of a polynomial in place;
- *              inputs assumed to be in bitreversed order, output in normal
- *              order
- *
- *              The input is assumed to be in bitreversed order, and can
- *              have arbitrary coefficients in int16_t.
- *
- *              The output polynomial is in normal order, and
- *              coefficient-wise bound by INVNTT_BOUND in absolute value.
- *
- * Arguments:   - uint16_t *a: pointer to in/output polynomial
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND))
-);
-
-#define basemul_cached MLKEM_NAMESPACE(basemul_cached)
-/************************************************************
- * Name: basemul_cached
- *
- * Description: Computes a representative modulo q of
- *              (a0*b0 + a1*b_cached, a0*b1 + a1*b0)/65536
- *
- *              If b_cached is b1*zeta, this represents the
- *              product of (a0 + a1*X) and (b0 + b1*X) in
- *              Fq[X]/(X^2 - zeta).
- *
- * Arguments: - r: Pointer to output polynomial
- *                   Upon return, coefficients are bound by
- *                   2*MLKEM_Q in absolute value.
- *            - a: Pointer to first input polynomial
- *                   Every coefficient must be in [0..4095]
- *            - b: Pointer to second input polynomial
- *                   Can have arbitrary int16_t coefficients
- *            - b_cached: Some precomputed value, typically derived from
- *                   b1 and a twiddle factor. Can be an arbitary int16_t.
- ************************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
-                    int16_t b_cached)
-__contract__(
-  requires(memory_no_alias(r, 2 * sizeof(int16_t)))
-  requires(memory_no_alias(a, 2 * sizeof(int16_t)))
-  requires(memory_no_alias(b, 2 * sizeof(int16_t)))
-  requires(array_bound(a, 0, 2, 0, UINT12_LIMIT))
-  assigns(memory_slice(r, 2 * sizeof(int16_t)))
-  ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))
-);
-
-#endif /* NTT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/params.h
index 57ea4c8ba..7f6c12625 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/params.h
@@ -18,6 +18,7 @@
 #define MLKEM_N 256
 #define MLKEM_Q 3329
 #define UINT12_LIMIT 4096
+#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */
 
 #define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */
 #define MLKEM_SSBYTES 32  /* size in bytes of shared key */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.c
index 7483ebf6d..e8a2e2c6e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.c
@@ -8,388 +8,246 @@
 #include <stdint.h>
 #include <string.h>
 #include "arith_backend.h"
-#include "cbd.h"
 #include "cbmc.h"
 #include "debug.h"
 #include "fips202x4.h"
-#include "ntt.h"
 #include "poly.h"
-#include "reduce.h"
+#include "sampling.h"
 #include "symmetric.h"
 #include "verify.h"
 
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
-{
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 16)))
-    {
-      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
-    }
-
-    r[i * 4] = t[0] | (t[1] << 4);
-    r[i * 4 + 1] = t[2] | (t[3] << 4);
-    r[i * 4 + 2] = t[4] | (t[5] << 4);
-    r[i * 4 + 3] = t[6] | (t[7] << 4);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
-{
-  unsigned j;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-  for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(invariant(j <= MLKEM_N / 4))
-  {
-    unsigned k;
-    uint16_t t[4];
-    for (k = 0; k < 4; k++)
-    __loop__(
-      invariant(k <= 4)
-      invariant(forall(r, 0, k, t[r] < (1u << 10))))
-    {
-      t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
-    }
-
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 10-bit in size.
-     */
-    r[5 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
-    r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
-    r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
-    r[5 * j + 4] = (t[3] >> 2);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
-  {
-    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
-    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d10(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16)
+#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic)
+#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce)
+#define fqmul MLKEM_NAMESPACE(fqmul)
+#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce)
+#define basemul_cached MLKEM_NAMESPACE(basemul_cached)
+#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q)
+#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block)
+#define ntt_layer MLKEM_NAMESPACE(ntt_layer)
+#define invntt_layer MLKEM_NAMESPACE(invntt_layer)
+/* End of static namespacing */
+
+/*************************************************
+ * Name:        cast_uint16_to_int16
+ *
+ * Description: Cast uint16 value to int16
+ *
+ * Returns:
+ *   input x in     0 .. 32767: returns value unchanged
+ *   input x in 32768 .. 65535: returns (x - 65536)
+ **************************************************/
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "conversion"
+#endif
+ALWAYS_INLINE
+static INLINE int16_t cast_uint16_to_int16(uint16_t x)
 {
-  unsigned j;
-  for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(
-    invariant(j <= MLKEM_N / 4)
-    invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
-  {
-    unsigned k;
-    uint16_t t[4];
-    uint8_t const *base = &a[5 * j];
-
-    t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6));
-    t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4));
-    t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2));
-
-    for (k = 0; k < 4; k++)
-    __loop__(
-      invariant(k <= 4)
-      invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
-    }
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  /*
+   * PORTABILITY: This relies on uint16_t -> int16_t
+   * being implemented as the inverse of int16_t -> uint16_t,
+   * which is implementation-defined (C99 6.3.1.3 (3)).
+   * CBMC (correctly) fails to prove this conversion is OK,
+   * so we have to suppress that check here
+   */
+  return (int16_t)x;
 }
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
-          || MLKEM_K == 3) */
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
 
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
+/*************************************************
+ * Name:        montgomery_reduce_generic
+ *
+ * Description: Generic Montgomery reduction; given a 32-bit integer a, computes
+ *              16-bit integer congruent to a * R^-1 mod q, where R=2^16
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q, with absolute value
+ *                <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2
+ *
+ **************************************************/
+ALWAYS_INLINE
+static INLINE int16_t montgomery_reduce_generic(int32_t a)
 {
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */
+  const uint32_t QINV = 62209; /* q^-1 mod 2^16 */
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 32)))
-    {
-      t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
-    }
+  /*  Compute a*q^{-1} mod 2^16 in unsigned representatives */
+  const uint16_t a_reduced = a & UINT16_MAX;
+  const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
 
-    /*
-     * Explicitly truncate to avoid warning about
-     * implicit truncation in CBMC, and use array indexing into
-     * r rather than pointer-arithmetic to simplify verification
-     */
-    r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
-    r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
-    r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
-    r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
-    r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
-  }
-}
+  /* Lift to signed canonical representative mod 2^16. */
+  const int16_t t = cast_uint16_to_int16(a_inverted);
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
-{
-  unsigned j;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  int32_t r = a - ((int32_t)t * MLKEM_Q);
+  /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */
 
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(invariant(j <= MLKEM_N / 8))
-  {
-    unsigned k;
-    uint16_t t[8];
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(k <= 8)
-      invariant(forall(r, 0, k, t[r] < (1u << 11))))
-    {
-      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
-    }
+  /*
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  r = r >> 16;
+  /* Bounds: |r >> 16| <= ceil(|r| / 2^16)
+   *                   <= ceil(|a| / 2^16 + MLKEM_Q / 2)
+   *                   <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
+   *
+   * (Note that |a >> 16| = ceil(|a| / 2^16) for negative a)
+   */
 
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 11-bit in size.
-     */
-    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
-    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
-    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
-    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
-    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
-    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
-    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
-    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
-    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
-    r[11 * j + 10] = (t[7] >> 3);
-  }
+  return (int16_t)r;
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+/*************************************************
+ * Name:        montgomery_reduce
+ *
+ * Description: Montgomery reduction
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *                  Must be smaller than 2 * 2^12 * 2^15 in absolute value.
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q,
+ *              smaller than 2 * q in absolute value.
+ **************************************************/
+static INLINE int16_t montgomery_reduce(int32_t a)
+__contract__(
+  requires(a > -(2 * UINT12_LIMIT * 32768))
+  requires(a <  (2 * UINT12_LIMIT * 32768))
+  ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
+)
 {
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
-  {
-    unsigned j;
-    uint8_t t[8];
-    const unsigned offset = i * 5;
-    /*
-     * Explicitly truncate to avoid warning about
-     * implicit truncation in CBMC and unwind loop for ease
-     * of proof.
-     */
-
-    /*
-     * Decompress 5 8-bit bytes (so 40 bits) into
-     * 8 5-bit values stored in t[]
-     */
-    t[0] = 0x1F & (a[offset + 0] >> 0);
-    t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3));
-    t[2] = 0x1F & (a[offset + 1] >> 2);
-    t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1));
-    t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4));
-    t[5] = 0x1F & (a[offset + 3] >> 1);
-    t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2));
-    t[7] = 0x1F & (a[offset + 4] >> 3);
-
-    /* and copy to the correct slice in r[] */
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(j <= 8 && i <= MLKEM_N / 8)
-      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
-    }
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  int16_t res;
+  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
+
+  res = montgomery_reduce_generic(a);
+  /* Bounds:
+   * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2
+   *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
+   *        < 2 * MLKEM_Q */
+
+  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
+  return res;
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d11(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+#if !defined(MLKEM_USE_NATIVE_POLY_TOMONT) ||           \
+    !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
+    !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        fqmul
+ *
+ * Description: Montgomery multiplication modulo q=3329
+ *
+ * Arguments:   - int16_t a: first factor
+ *                  Can be any int16_t.
+ *              - int16_t b: second factor.
+ *                  Must be signed canonical (abs value <(q+1)/2)
+ *
+ * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and
+ * smaller than q in absolute value.
+ *
+ **************************************************/
+static INLINE int16_t fqmul(int16_t a, int16_t b)
+__contract__(
+  requires(b > -HALF_Q)
+  requires(b < HALF_Q)
+  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
+)
 {
-  unsigned j;
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(
-    invariant(j <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
-  {
-    unsigned k;
-    uint16_t t[8];
-    uint8_t const *base = &a[11 * j];
-    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
-    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
-                    ((uint16_t)base[4] << 10));
-    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
-    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
-    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
-                    ((uint16_t)base[8] << 9));
-    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
-    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
-
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(k <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
-    }
-  }
+  int16_t res;
+  debug_assert_abs_bound(&b, 1, HALF_Q);
+
+  res = montgomery_reduce((int32_t)a * (int32_t)b);
+  /* Bounds:
+   * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
+   *        < MLKEM_Q
+   */
 
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  debug_assert_abs_bound(&res, 1, MLKEM_Q);
+  return res;
 }
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */
-
-#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+#endif /* !defined(MLKEM_USE_NATIVE_POLY_TOMONT) ||           \
+          !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
+          !defined(MLKEM_USE_NATIVE_NTT) ||                   \
+          !defined(MLKEM_USE_NATIVE_INTT) */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || !defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        barrett_reduce
+ *
+ * Description: Barrett reduction; given a 16-bit integer a, computes
+ *              centered representative congruent to a mod q in
+ *              {-(q-1)/2,...,(q-1)/2}
+ *
+ * Arguments:   - int16_t a: input integer to be reduced
+ *
+ * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
+ **************************************************/
+static INLINE int16_t barrett_reduce(int16_t a)
+__contract__(
+  ensures(return_value > -HALF_Q && return_value < HALF_Q)
+)
 {
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(invariant(i <= MLKEM_N / 2))
-  {
-    const uint16_t t0 = a->coeffs[2 * i];
-    const uint16_t t1 = a->coeffs[2 * i + 1];
-    /*
-     * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
-     * significant data, so these can be packed into 24 bits or exactly
-     * 3 bytes, as follows.
-     */
-
-    /* Least significant bits 0 - 7 of t0. */
-    r[3 * i + 0] = t0 & 0xFF;
-
-    /*
-     * Most significant bits 8 - 11 of t0 become the least significant
-     * nibble of the second byte. The least significant 4 bits
-     * of t1 become the upper nibble of the second byte.
-     */
-    r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+  /*
+   * To divide by MLKEM_Q using Barrett multiplication, the "magic number"
+   * multiplier is round_to_nearest(2**26/MLKEM_Q)
+   */
+  const int BPOWER = 26;
+  const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q;
 
-    /* Bits 4 - 11 of t1 become the third byte. */
-    r[3 * i + 2] = t1 >> 4;
-  }
-}
-#else  /* MLKEM_USE_NATIVE_POLY_TOBYTES */
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
-{
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-  poly_tobytes_native(r, a);
-}
-#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+  /*
+   * Compute round_to_nearest(a/MLKEM_Q) using the multiplier
+   * above and shift by BPOWER places.
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER;
 
-#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
-  {
-    const uint8_t t0 = a[3 * i + 0];
-    const uint8_t t1 = a[3 * i + 1];
-    const uint8_t t2 = a[3 * i + 2];
-    r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
-    r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
-  }
+  /*
+   * t is in -10 .. +10, so we need 32-bit math to
+   * evaluate t * MLKEM_Q and the subsequent subtraction
+   */
+  int16_t res = (int16_t)(a - t * MLKEM_Q);
 
-  /* Note that the coefficients are not canonical */
-  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
-}
-#else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-{
-  poly_frombytes_native(r, a);
+  debug_assert_abs_bound(&res, 1, HALF_Q);
+  return res;
 }
-#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+#endif /* !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || \
+          !defined(MLKEM_USE_NATIVE_INTT) */
+
+static void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
+                           int16_t b_cached)
+__contract__(
+  requires(memory_no_alias(r, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(a, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(b, 2 * sizeof(int16_t)))
+  requires(array_bound(a, 0, 2, 0, UINT12_LIMIT))
+  assigns(memory_slice(r, 2 * sizeof(int16_t)))
+  ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q)))
 {
-  unsigned i;
-#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
-#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
-#endif
+  int32_t t0, t1;
+  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
-  {
-    unsigned j;
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <  MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
-    {
-      /* Prevent the compiler from recognizing this as a bit selection */
-      uint8_t mask = value_barrier_u8(1u << j);
-      r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
-    }
-  }
-  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
-}
+  t0 = (int32_t)a[1] * b_cached;
+  t0 += (int32_t)a[0] * b[0];
+  t1 = (int32_t)a[0] * b[1];
+  t1 += (int32_t)a[1] * b[0];
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
-{
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  /* |ti| < 2 * UINT12_LIMIT * 2^15 */
+  r[0] = montgomery_reduce(t0);
+  r[1] = montgomery_reduce(t1);
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    msg[i] = 0;
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8))
-    {
-      uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
-      msg[i] |= t << j;
-    }
-  }
+  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -434,12 +292,46 @@ void poly_tomont(poly *r)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomont(poly *r)
 {
-  poly_tomont_native(r);
+  poly_tomont_native(r->coeffs);
   debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
 
 #if !defined(MLKEM_USE_NATIVE_POLY_REDUCE)
+/************************************************************
+ * Name: scalar_signed_to_unsigned_q
+ *
+ * Description: converts signed polynomial coefficient
+ *              from signed (-3328 .. 3328) form to
+ *              unsigned form (0 .. 3328).
+ *
+ * Note: Cryptographic constant time implementation
+ *
+ * Examples:       0 -> 0
+ *                 1 -> 1
+ *              3328 -> 3328
+ *                -1 -> 3328
+ *                -2 -> 3327
+ *             -3328 -> 1
+ *
+ * Arguments: c: signed coefficient to be converted
+ ************************************************************/
+static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
+__contract__(
+  requires(c > -MLKEM_Q && c < MLKEM_Q)
+  ensures(return_value >= 0 && return_value < MLKEM_Q)
+  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
+{
+  debug_assert_abs_bound(&c, 1, MLKEM_Q);
+
+  /* Add Q if c is negative, but in constant time */
+  c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
+
+  /* and therefore cast to uint16_t is safe. */
+  debug_assert_bound(&c, 1, 0, MLKEM_Q);
+  return (uint16_t)c;
+}
+
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
@@ -461,7 +353,7 @@ void poly_reduce(poly *r)
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
-  poly_reduce_native(r);
+  poly_reduce_native(r->coeffs);
   debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
@@ -520,13 +412,232 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 MLKEM_NATIVE_INTERNAL_API
 void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
-  poly_mulcache_compute_native(x, a);
+  poly_mulcache_compute_native(x->coeffs, a->coeffs);
   /* Omitting bounds assertion since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
 }
 #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 
+#if !defined(MLKEM_USE_NATIVE_NTT)
+/*
+ * Computes a block of CT butterflies with a fixed twiddle factor,
+ * using Montgomery multiplication.
+ * Parameters:
+ * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
+ * - zeta: Twiddle factor to use for the butterfly. This must be in
+ *         Montgomery form and signed canonical.
+ * - start: Offset to the beginning of the butterfly block
+ * - len: Index difference between coefficients subject to a butterfly
+ * - bound: Ghost variable describing coefficient bound: Prior to `start`,
+ *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
+ *          they must be bound by `bound`.
+ * When this function returns, output coefficients in the index range
+ * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
+ * Example:
+ * - start=8, len=4
+ *   This would compute the following four butterflies
+ *          8     --    12
+ *             9    --     13
+ *                10   --     14
+ *                   11   --     15
+ * - start=4, len=2
+ *   This would compute the following two butterflies
+ *          4 -- 6
+ *             5 -- 7
+ */
+static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
+                                unsigned start, unsigned len, int bound)
+__contract__(
+  requires(start < MLKEM_N)
+  requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
+  requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
+  requires(-HALF_Q < zeta && zeta < HALF_Q)
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(array_abs_bound(r, 0, start, bound + MLKEM_Q))
+  requires(array_abs_bound(r, start, MLKEM_N, bound))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q))
+  ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
+{
+  /* `bound` is a ghost variable only needed in the CBMC specification */
+  unsigned j;
+  ((void)bound);
+  for (j = start; j < start + len; j++)
+  __loop__(
+    invariant(start <= j && j <= start + len)
+    /*
+     * Coefficients are updated in strided pairs, so the bounds for the
+     * intermediate states alternate twice between the old and new bound
+     */
+    invariant(array_abs_bound(r, 0,           j,           bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j,           start + len, bound))
+    invariant(array_abs_bound(r, start + len, j + len,     bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j + len,     MLKEM_N,     bound)))
+  {
+    int16_t t;
+    t = fqmul(r[j + len], zeta);
+    r[j + len] = r[j] - t;
+    r[j] = r[j] + t;
+  }
+}
+
+/*
+ * Compute one layer of forward NTT
+ * Parameters:
+ * - r: Pointer to base of polynomial
+ * - len: Stride of butterflies in this layer.
+ * - layer: Ghost variable indicating which layer is being applied.
+ *          Must match `len` via `len == MLKEM_N >> layer`.
+ * Note: `len` could be dropped and computed in the function, but
+ *   we are following the structure of the reference NTT from the
+ *   official Kyber implementation here, merely adding `layer` as
+ *   a ghost variable for the specifications.
+ */
+static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
+  requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
+{
+  unsigned start, k;
+  /* `layer` is a ghost variable only needed in the CBMC specification */
+  ((void)layer);
+  /* Twiddle factors for layer n start at index 2^(layer-1) */
+  k = MLKEM_N / (2 * len);
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(start < MLKEM_N + 2 * len)
+    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
+    invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
+    invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
+  {
+    int16_t zeta = zetas[k++];
+    ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
+  }
+}
+
+/*
+ * Compute full forward NTT
+ * NOTE: This particular implementation satisfies a much tighter
+ * bound on the output coefficients (5*q) than the contractual one (8*q),
+ * but this is not needed in the calling code. Should we change the
+ * base multiplication strategy to require smaller NTT output bounds,
+ * the proof may need strengthening.
+ */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *p)
+{
+  unsigned len, layer;
+  int16_t *r;
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+  r = p->coeffs;
+
+  for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
+  __loop__(
+    invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer))
+    invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)))
+  {
+    ntt_layer(r, len, layer);
+  }
+
+  /* Check the stronger bound */
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
+}
+#else  /* MLKEM_USE_NATIVE_NTT */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *p)
+{
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+  ntt_native(p->coeffs);
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
+}
+#endif /* MLKEM_USE_NATIVE_NTT */
+
+#if !defined(MLKEM_USE_NATIVE_INTT)
+
+/* Compute one layer of inverse NTT */
+static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
+  requires(len == (1 << (8 - layer)))
+  requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+{
+  unsigned start, k;
+  /* `layer` is a ghost variable used only in the specification */
+  ((void)layer);
+  k = MLKEM_N / len - 1;
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+    invariant(start <= MLKEM_N && k <= 127)
+    /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
+    invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
+  {
+    unsigned j;
+    int16_t zeta = zetas[k--];
+    for (j = start; j < start + len; j++)
+    __loop__(
+      invariant(start <= j && j <= start + len)
+      invariant(start <= MLKEM_N && k <= 127)
+      invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+    {
+      int16_t t = r[j];
+      r[j] = barrett_reduce(t + r[j + len]);
+      r[j + len] = r[j + len] - t;
+      r[j + len] = fqmul(r[j + len], zeta);
+    }
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *p)
+{
+  /*
+   * Scale input polynomial to account for Montgomery factor
+   * and NTT twist. This also brings coefficients down to
+   * absolute value < MLKEM_Q.
+   */
+  unsigned j, len, layer;
+  const int16_t f = 1441;
+  int16_t *r = p->coeffs;
+
+  for (j = 0; j < MLKEM_N; j++)
+  __loop__(
+    invariant(j <= MLKEM_N)
+    invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
+  {
+    r[j] = fqmul(r[j], f);
+  }
+
+  /* Run the invNTT layers */
+  for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
+  __loop__(
+    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
+    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+  {
+    invntt_layer(p->coeffs, len, layer);
+  }
+
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
+}
+#else  /* MLKEM_USE_NATIVE_INTT */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *p)
+{
+  intt_native(p->coeffs);
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
+}
+#endif /* MLKEM_USE_NATIVE_INTT */
+
 #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 #define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.h
index 6a14c785d..cb0d67c1a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly.h
@@ -9,7 +9,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "reduce.h"
+#include "debug.h"
 #include "verify.h"
 
 /* Absolute exclusive upper bound for the output of the inverse NTT */
@@ -18,6 +18,9 @@
 /* Absolute exclusive upper bound for the output of the forward NTT */
 #define NTT_BOUND (8 * MLKEM_Q)
 
+#define zetas MLKEM_NAMESPACE(zetas)
+extern const int16_t zetas[128]; /* (inv)NTT butterfly multipliers */
+
 /*
  * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
  * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
@@ -38,520 +41,6 @@ typedef struct
   int16_t coeffs[MLKEM_N >> 1];
 } poly_mulcache;
 
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1)
-#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4)
-#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5)
-#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10)
-#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11)
-#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4)
-#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5)
-#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10)
-#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11)
-#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q)
-/* End of static namespacing */
-
-/************************************************************
- * Name: scalar_compress_d1
- *
- * Description: Computes round(u * 2 / q)
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 1.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d1(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 2)
-  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
-{
-  uint32_t d0 = u << 1;
-  d0 *= 645083;
-  d0 += 1u << 30;
-  d0 >>= 31;
-  return d0;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_compress_d4
- *
- * Description: Computes round(u * 16 / q) % 16
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 4.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d4(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 16)
-  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
-{
-  uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */
-  return (d0 + (1u << 27)) >> 28;      /* round(d0/2^28) */
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d4
- *
- * Description: Computes round(u * q / 16)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 4.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d4(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 16)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 8) / 16; }
-
-/************************************************************
- * Name: scalar_compress_d5
- *
- * Description: Computes round(u * 32 / q) % 32
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 5.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d5(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 32)
-  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
-{
-  uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */
-  return (d0 + (1u << 26)) >> 27;      /* round(d0/2^27) */
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d5
- *
- * Description: Computes round(u * q / 32)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 5.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 32
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d5(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 32)
-  ensures(return_value <= MLKEM_Q - 1)
-) { return ((u * MLKEM_Q) + 16) / 32; }
-
-/************************************************************
- * Name: scalar_compress_d10
- *
- * Description: Computes round(u * 2**10 / q) % 2**10
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d10(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < (1u << 10))
-  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
-{
-  uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^32 / MLKEM_Q) */
-  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
-  return (d0 & 0x3FF);
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d10
- *
- * Description: Computes round(u * q / 1024)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d10(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 1024)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 512) / 1024; }
-
-/************************************************************
- * Name: scalar_compress_d11
- *
- * Description: Computes round(u * 2**11 / q) % 2**11
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 11.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d11(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < (1u << 11))
-  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
-{
-  uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */
-  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
-  return (d0 & 0x7FF);
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d11
- *
- * Description: Computes round(u * q / 1024)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d11(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 2048)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 1024) / 2048; }
-
-/************************************************************
- * Name: scalar_signed_to_unsigned_q
- *
- * Description: converts signed polynomial coefficient
- *              from signed (-3328 .. 3328) form to
- *              unsigned form (0 .. 3328).
- *
- * Note: Cryptographic constant time implementation
- *
- * Examples:       0 -> 0
- *                 1 -> 1
- *              3328 -> 3328
- *                -1 -> 3328
- *                -2 -> 3327
- *             -3328 -> 1
- *
- * Arguments: c: signed coefficient to be converted
- ************************************************************/
-static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
-__contract__(
-  requires(c > -MLKEM_Q && c < MLKEM_Q)
-  ensures(return_value >= 0 && return_value < MLKEM_Q)
-  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
-{
-  debug_assert_abs_bound(&c, 1, MLKEM_Q);
-
-  /* Add Q if c is negative, but in constant time */
-  c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
-
-  /* and therefore cast to uint16_t is safe. */
-  debug_assert_bound(&c, 1, 0, MLKEM_Q);
-  return (uint16_t)c;
-}
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
-    (MLKEM_K == 2 || MLKEM_K == 3)
-#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
-/*************************************************
- * Name:        poly_compress_d4
- *
- * Description: Compression (4 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
-
-#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
-/*************************************************
- * Name:        poly_compress_d10
- *
- * Description: Compression (10 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
-
-#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
-/*************************************************
- * Name:        poly_decompress_d4
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
-
-#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
-/*************************************************
- * Name:        poly_decompress_d10
- *
- * Description: De-serialization and subsequent decompression (10 bits) of a
- *              polynomial; approximate inverse of poly_compress_d10
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d10(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
-          || MLKEM_K == 3) */
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
-#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
-/*************************************************
- * Name:        poly_compress_d5
- *
- * Description: Compression (5 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
-
-#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
-/*************************************************
- * Name:        poly_compress_d11
- *
- * Description: Compression (11 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
-
-#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
-/*************************************************
- * Name:        poly_decompress_d5
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
-
-#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
-/*************************************************
- * Name:        poly_decompress_d11
- *
- * Description: De-serialization and subsequent decompression (11 bits) of a
- *              polynomial; approximate inverse of poly_compress_d11
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d11(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
-        */
-
-#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
-/*************************************************
- * Name:        poly_tobytes
- *
- * Description: Serialization of a polynomial.
- *              Signed coefficients are converted to
- *              unsigned form before serialization.
- *
- * Arguments:   INPUT:
- *              - a: const pointer to input polynomial,
- *                with each coefficient in the range [0,1,..,Q-1]
- *              OUTPUT
- *              - r: pointer to output byte array
- *                   (of MLKEM_POLYBYTES bytes)
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYBYTES))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r))
-);
-
-
-#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes)
-/*************************************************
- * Name:        poly_frombytes
- *
- * Description: De-serialization of a polynomial.
- *
- * Arguments:   INPUT
- *              - a: pointer to input byte array
- *                   (of MLKEM_POLYBYTES bytes)
- *              OUTPUT
- *              - r: pointer to output polynomial, with
- *                   each coefficient unsigned and in the range
- *                   0 .. 4095
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))
-);
-
-
-#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg)
-/*************************************************
- * Name:        poly_frommsg
- *
- * Description: Convert 32-byte message to polynomial
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *msg: pointer to input message
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
-__contract__(
-  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
-
-#define poly_tomsg MLKEM_NAMESPACE(poly_tomsg)
-/*************************************************
- * Name:        poly_tomsg
- *
- * Description: Convert polynomial to 32-byte message
- *
- * Arguments:   - uint8_t *msg: pointer to output message
- *              - const poly *r: pointer to input polynomial
- *                Coefficients must be unsigned canonical
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r)
-__contract__(
-  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(msg))
-);
-
 #define poly_basemul_montgomery_cached \
   MLKEM_NAMESPACE(poly_basemul_montgomery_cached)
 /*************************************************
@@ -715,4 +204,56 @@ __contract__(
   assigns(object_whole(r))
 );
 
+#define poly_ntt MLKEM_NAMESPACE(poly_ntt)
+/*************************************************
+ * Name:        poly_ntt
+ *
+ * Description: Computes the negacyclic number-theoretic transform (NTT)
+ *              of a polynomial in place.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bounded by MLKEM_Q in absolute value.
+ *
+ *              The output polynomial is in bitreversed order, and
+ *              coefficient-wise bounded by NTT_BOUND in absolute value.
+ *
+ *              (NOTE: Sometimes the input to the NTT is actually smaller,
+ *               which gives better bounds.)
+ *
+ * Arguments:   - poly *r: pointer to in/output polynomial
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND))
+);
+
+#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont)
+/*************************************************
+ * Name:        poly_invntt_tomont
+ *
+ * Description: Computes the inverse of the negacyclic number-theoretic
+ *              transform (NTT) of a polynomial in place. The scaling
+ *              applied by the C implementation also accounts for the
+ *              Montgomery factor.
+ *
+ *              The input is assumed to be in bitreversed order, and can
+ *              have arbitrary coefficients in int16_t.
+ *
+ *              The output polynomial is in normal order, and
+ *              coefficient-wise bounded by INVNTT_BOUND in absolute value.
+ *
+ * Arguments:   - poly *r: pointer to in/output polynomial
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND))
+);
+
 #endif /* POLY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly_k.c
similarity index 97%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/polyvec.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly_k.c
index 50ea1c34a..c2d330ea9 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/polyvec.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly_k.c
@@ -2,13 +2,12 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#include "polyvec.h"
+#include "poly_k.h"
 #include <stdint.h>
 #include <string.h>
 #include "arith_backend.h"
-#include "cbd.h"
-#include "ntt.h"
-#include "poly.h"
+#include "compress.h"
+#include "sampling.h"
 #include "symmetric.h"
 
 #include "debug.h"
@@ -131,7 +130,9 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
   /* Omitting bounds assertion for cache since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
-  polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache);
+  polyvec_basemul_acc_montgomery_cached_native(r->coeffs, (const int16_t *)a,
+                                               (const int16_t *)b,
+                                               (const int16_t *)b_cache);
 }
 #endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly_k.h
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/polyvec.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly_k.h
index 8be8579e0..0aea95912 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/polyvec.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/poly_k.h
@@ -2,11 +2,12 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#ifndef POLYVEC_H
-#define POLYVEC_H
+#ifndef POLY_K_H
+#define POLY_K_H
 
 #include <stdint.h>
 #include "common.h"
+#include "compress.h"
 #include "poly.h"
 
 #define polyvec MLKEM_NAMESPACE_K(polyvec)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/reduce.h
deleted file mode 100644
index b432a4201..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/reduce.h
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef REDUCE_H
-#define REDUCE_H
-
-#include <stdint.h>
-#include "cbmc.h"
-#include "common.h"
-#include "debug.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16)
-#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic)
-#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce)
-#define fqmul MLKEM_NAMESPACE(fqmul)
-#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce)
-/* End of static namespacing */
-
-#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */
-
-/*************************************************
- * Name:        cast_uint16_to_int16
- *
- * Description: Cast uint16 value to int16
- *
- * Returns:
- *   input x in     0 .. 32767: returns value unchanged
- *   input x in 32768 .. 65535: returns (x - 65536)
- **************************************************/
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
-ALWAYS_INLINE
-static INLINE int16_t cast_uint16_to_int16(uint16_t x)
-{
-  /*
-   * PORTABILITY: This relies on uint16_t -> int16_t
-   * being implemented as the inverse of int16_t -> uint16_t,
-   * which is implementation-defined (C99 6.3.1.3 (3))
-   * CBMC (correctly) fails to prove this conversion is OK,
-   * so we have to suppress that check here
-   */
-  return (int16_t)x;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/*************************************************
- * Name:        montgomery_reduce_generic
- *
- * Description: Generic Montgomery reduction; given a 32-bit integer a, computes
- *              16-bit integer congruent to a * R^-1 mod q, where R=2^16
- *
- * Arguments:   - int32_t a: input integer to be reduced
- *
- * Returns:     integer congruent to a * R^-1 modulo q, with absolute value
- *                <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2
- *
- **************************************************/
-ALWAYS_INLINE
-static INLINE int16_t montgomery_reduce_generic(int32_t a)
-{
-  /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */
-  const uint32_t QINV = 62209; /* q^-1 mod 2^16 */
-
-  /*  Compute a*q^{-1} mod 2^16 in unsigned representatives */
-  const uint16_t a_reduced = a & UINT16_MAX;
-  const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
-
-  /* Lift to signed canonical representative mod 2^16. */
-  const int16_t t = cast_uint16_to_int16(a_inverted);
-
-  int32_t r = a - ((int32_t)t * MLKEM_Q);
-  /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */
-
-  /*
-   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
-   * implementation-defined for negative left argument. Here,
-   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
-   */
-  r = r >> 16;
-  /* Bounds: |r >> 16| <= ceil(|r| / 2^16)
-   *                   <= ceil(|a| / 2^16 + MLKEM_Q / 2)
-   *                   <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
-   *
-   * (Note that |a >> n| = ceil(|a| / 2^16) for negative a)
-   */
-
-  return (int16_t)r;
-}
-
-/*************************************************
- * Name:        montgomery_reduce
- *
- * Description: Montgomery reduction
- *
- * Arguments:   - int32_t a: input integer to be reduced
- *                  Must be smaller than 2 * 2^12 * 2^15 in absolute value.
- *
- * Returns:     integer congruent to a * R^-1 modulo q,
- *              smaller than 2 * q in absolute value.
- **************************************************/
-static INLINE int16_t montgomery_reduce(int32_t a)
-__contract__(
-  requires(a > -(2 * UINT12_LIMIT * 32768))
-  requires(a <  (2 * UINT12_LIMIT * 32768))
-  ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
-)
-{
-  int16_t res;
-  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
-
-  res = montgomery_reduce_generic(a);
-  /* Bounds:
-   * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2
-   *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
-   *        < 2 * MLKEM_Q */
-
-  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
-  return res;
-}
-
-/*************************************************
- * Name:        fqmul
- *
- * Description: Montgomery multiplication modulo q=3329
- *
- * Arguments:   - int16_t a: first factor
- *                  Can be any int16_t.
- *              - int16_t b: second factor.
- *                  Must be signed canonical (abs value <(q+1)/2)
- *
- * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and
- * smaller than q in absolute value.
- *
- **************************************************/
-static INLINE int16_t fqmul(int16_t a, int16_t b)
-__contract__(
-  requires(b > -HALF_Q)
-  requires(b < HALF_Q)
-  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
-)
-{
-  int16_t res;
-  debug_assert_abs_bound(&b, 1, HALF_Q);
-
-  res = montgomery_reduce((int32_t)a * (int32_t)b);
-  /* Bounds:
-   * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
-   *        < MLKEM_Q
-   */
-
-  debug_assert_abs_bound(&res, 1, MLKEM_Q);
-  return res;
-}
-
-/*************************************************
- * Name:        barrett_reduce
- *
- * Description: Barrett reduction; given a 16-bit integer a, computes
- *              centered representative congruent to a mod q in
- *              {-(q-1)/2,...,(q-1)/2}
- *
- * Arguments:   - int16_t a: input integer to be reduced
- *
- * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
- **************************************************/
-static INLINE int16_t barrett_reduce(int16_t a)
-__contract__(
-  ensures(return_value > -HALF_Q && return_value < HALF_Q)
-)
-{
-  /*
-   * To divide by MLKEM_Q using Barrett multiplication, the "magic number"
-   * multiplier is round_to_nearest(2**26/MLKEM_Q)
-   */
-  const int BPOWER = 26;
-  const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q;
-
-  /*
-   * Compute round_to_nearest(a/MLKEM_Q) using the multiplier
-   * above and shift by BPOWER places.
-   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
-   * implementation-defined for negative left argument. Here,
-   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
-   */
-  const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER;
-
-  /*
-   * t is in -10 .. +10, so we need 32-bit math to
-   * evaluate t * MLKEM_Q and the subsequent subtraction
-   */
-  int16_t res = (int16_t)(a - t * MLKEM_Q);
-
-  debug_assert_abs_bound(&res, 1, HALF_Q);
-  return res;
-}
-
-#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/sampling.c
similarity index 73%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/rej_uniform.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/sampling.c
index cbbe4407f..98cbdcb74 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/rej_uniform.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/sampling.c
@@ -9,7 +9,7 @@
 #include "debug.h"
 #include "fips202.h"
 #include "fips202x4.h"
-#include "rej_uniform.h"
+#include "sampling.h"
 #include "symmetric.h"
 
 /* Static namespacing
@@ -18,6 +18,8 @@
  * within a single compilation unit. */
 #define rej_uniform MLKEM_NAMESPACE(rej_uniform)
 #define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
+#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
+#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
 /* End of static namespacing */
 
 static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
@@ -233,9 +235,113 @@ void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
   xof_release(&state);
 }
 
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
+#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
+/* End of static namespacing */
+
+/*************************************************
+ * Name:        load32_littleendian
+ *
+ * Description: load 4 bytes into a 32-bit integer
+ *              in little-endian order
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x
+ **************************************************/
+static uint32_t load32_littleendian(const uint8_t x[4])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  r |= (uint32_t)x[3] << 24;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
+  {
+    unsigned j;
+    uint32_t t = load32_littleendian(buf + 4 * i);
+    uint32_t d = t & 0x55555555;
+    d += (t >> 1) & 0x55555555;
+
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
+    {
+      const int16_t a = (d >> (4 * j + 0)) & 0x3;
+      const int16_t b = (d >> (4 * j + 2)) & 0x3;
+      r->coeffs[8 * i + j] = a - b;
+    }
+  }
+}
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+/*************************************************
+ * Name:        load24_littleendian
+ *
+ * Description: load 3 bytes into a 32-bit integer
+ *              in little-endian order.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
+ **************************************************/
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 4; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 4)
+    invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
+  {
+    unsigned j;
+    const uint32_t t = load24_littleendian(buf + 3 * i);
+    uint32_t d = t & 0x00249249;
+    d += (t >> 1) & 0x00249249;
+    d += (t >> 2) & 0x00249249;
+
+    for (j = 0; j < 4; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 4 && j <= 4)
+      invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
+    {
+      const int16_t a = (d >> (6 * j + 0)) & 0x7;
+      const int16_t b = (d >> (6 * j + 3)) & 0x7;
+      r->coeffs[4 * i + j] = a - b;
+    }
+  }
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
+          3 */
+
 #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
-#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform)
-int empty_cu_rej_uniform;
+#define empty_cu_sampling MLKEM_NAMESPACE_K(empty_cu_sampling)
+int empty_cu_sampling;
 
 #endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/sampling.h
similarity index 63%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/rej_uniform.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/sampling.h
index 801287259..cc524e0fc 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/rej_uniform.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/sampling.h
@@ -2,8 +2,8 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#ifndef REJ_UNIFORM_H
-#define REJ_UNIFORM_H
+#ifndef SAMPLING_H
+#define SAMPLING_H
 
 #include <stdint.h>
 #include <stdlib.h>
@@ -11,6 +11,37 @@
 #include "common.h"
 #include "poly.h"
 
+#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
+/*************************************************
+ * Name:        poly_cbd2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
+/*************************************************
+ * Name:        poly_cbd3
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=3.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED || MLKEM_ETA1 == 3 */
+
 #define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
 /*************************************************
  * Name:        poly_rej_uniform_x4
@@ -60,4 +91,4 @@ __contract__(
   assigns(memory_slice(entry, sizeof(poly)))
   ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
 
-#endif /* REJ_UNIFORM_H */
+#endif /* SAMPLING_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/zetas.c
index 4ef887c62..987f0dce4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/zetas.c
@@ -10,7 +10,7 @@
 
 #include "common.h"
 #if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-#include "ntt.h"
+#include "poly.h"
 
 /*
  * Table of zeta values used in the reference NTT and inverse NTT.
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/arith_backend.h
index 0543b1bd1..ade31cda1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/arith_backend.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/arith_backend.h
@@ -17,7 +17,7 @@
  * Keep this _after_ the inclusion of the backend; otherwise,
  * the sanity checks won't have an effect. */
 #if defined(MLKEM_NATIVE_CHECK_APIS)
-#include "api.h"
+#include "native/api.h"
 #endif
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbd.c
deleted file mode 100644
index 1e6b7c5d1..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbd.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
-
-#include <stdint.h>
-#include "cbd.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
-#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
-/* End of static namespacing */
-
-/*************************************************
- * Name:        load32_littleendian
- *
- * Description: load 4 bytes into a 32-bit integer
- *              in little-endian order
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x
- **************************************************/
-static uint32_t load32_littleendian(const uint8_t x[4])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  r |= (uint32_t)x[3] << 24;
-  return r;
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
-  {
-    unsigned j;
-    uint32_t t = load32_littleendian(buf + 4 * i);
-    uint32_t d = t & 0x55555555;
-    d += (t >> 1) & 0x55555555;
-
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
-    {
-      const int16_t a = (d >> (4 * j + 0)) & 0x3;
-      const int16_t b = (d >> (4 * j + 2)) & 0x3;
-      r->coeffs[8 * i + j] = a - b;
-    }
-  }
-}
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
-/*************************************************
- * Name:        load24_littleendian
- *
- * Description: load 3 bytes into a 32-bit integer
- *              in little-endian order.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
- **************************************************/
-static uint32_t load24_littleendian(const uint8_t x[3])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  return r;
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 4; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 4)
-    invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
-  {
-    unsigned j;
-    const uint32_t t = load24_littleendian(buf + 3 * i);
-    uint32_t d = t & 0x00249249;
-    d += (t >> 1) & 0x00249249;
-    d += (t >> 2) & 0x00249249;
-
-    for (j = 0; j < 4; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 4 && j <= 4)
-      invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
-    {
-      const int16_t a = (d >> (6 * j + 0)) & 0x7;
-      const int16_t b = (d >> (6 * j + 3)) & 0x7;
-      r->coeffs[4 * i + j] = a - b;
-    }
-  }
-}
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
-          3 */
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd)
-int empty_cu_cbd;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbd.h
deleted file mode 100644
index 54c1f5b90..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/cbd.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef CBD_H
-#define CBD_H
-
-#include <stdint.h>
-#include "common.h"
-#include "poly.h"
-
-#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
-/*************************************************
- * Name:        poly_cbd2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
-#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
-/*************************************************
- * Name:        poly_cbd3
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=3.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */
-
-#endif /* CBD_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/common.h
index 4f326333e..62ed53ab1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/common.h
@@ -15,12 +15,19 @@
 #include "sys.h"
 
 /* Include backend metadata */
-#if defined(MLKEM_USE_NATIVE)
-#if defined(MLKEM_NATIVE_ARITH_BACKEND)
-#include MLKEM_NATIVE_ARITH_BACKEND
+#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH)
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_FILE)
+#include MLKEM_NATIVE_ARITH_BACKEND_FILE
+#else
+#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_ARITH is set, but MLKEM_NATIVE_ARITH_BACKEND_FILE is not.
+#endif
 #endif
-#if defined(MLKEM_NATIVE_FIPS202_BACKEND)
-#include MLKEM_NATIVE_FIPS202_BACKEND
+
+#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202)
+#if defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE)
+#include MLKEM_NATIVE_FIPS202_BACKEND_FILE
+#else
+#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, but MLKEM_NATIVE_FIPS202_BACKEND_FILE is not.
 #endif
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/compress.c
new file mode 100644
index 000000000..a03fe0ac4
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/compress.c
@@ -0,0 +1,395 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
+#include <stdint.h>
+#include <string.h>
+#include "arith_backend.h"
+#include "cbmc.h"
+#include "compress.h"
+#include "debug.h"
+#include "verify.h"
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 16)))
+    {
+      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
+    }
+
+    r[i * 4] = t[0] | (t[1] << 4);
+    r[i * 4 + 1] = t[2] | (t[3] << 4);
+    r[i * 4 + 2] = t[4] | (t[5] << 4);
+    r[i * 4 + 3] = t[6] | (t[7] << 4);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(invariant(j <= MLKEM_N / 4))
+  {
+    unsigned k;
+    uint16_t t[4];
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k <= 4)
+      invariant(forall(r, 0, k, t[r] < (1u << 10))))
+    {
+      t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 10-bit in size.
+     */
+    r[5 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
+    r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
+    r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
+    r[5 * j + 4] = (t[3] >> 2);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
+  {
+    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
+    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 4)
+    invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[4];
+    uint8_t const *base = &a[5 * j];
+
+    t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6));
+    t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4));
+    t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2));
+
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k <= 4)
+      invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 32)))
+    {
+      t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
+    }
+
+    /*
+     * Explicitly truncate to avoid warning about
+     * implicit truncation in CBMC, and use array indexing into
+     * r rather than pointer-arithmetic to simplify verification
+     */
+    r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
+    r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
+    r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
+    r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
+    r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(invariant(j <= MLKEM_N / 8))
+  {
+    unsigned k;
+    uint16_t t[8];
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+    {
+      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 11-bit in size.
+     */
+    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
+    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
+    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
+    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
+    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
+    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
+    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
+    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
+    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
+    r[11 * j + 10] = (t[7] >> 3);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
+  {
+    unsigned j;
+    uint8_t t[8];
+    const unsigned offset = i * 5;
+    /*
+     * Explicitly truncate to avoid warning about
+     * implicit truncation in CBMC and unwind loop for ease
+     * of proof.
+     */
+
+    /*
+     * Decompress 5 8-bit bytes (so 40 bits) into
+     * 8 5-bit values stored in t[]
+     */
+    t[0] = 0x1F & (a[offset + 0] >> 0);
+    t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3));
+    t[2] = 0x1F & (a[offset + 1] >> 2);
+    t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1));
+    t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4));
+    t[5] = 0x1F & (a[offset + 3] >> 1);
+    t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2));
+    t[7] = 0x1F & (a[offset + 4] >> 3);
+
+    /* and copy to the correct slice in r[] */
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(j <= 8 && i <= MLKEM_N / 8)
+      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[8];
+    uint8_t const *base = &a[11 * j];
+    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
+    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
+                    ((uint16_t)base[4] << 10));
+    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
+    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
+    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
+                    ((uint16_t)base[8] << 9));
+    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
+    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
+
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(invariant(i <= MLKEM_N / 2))
+  {
+    const uint16_t t0 = a->coeffs[2 * i];
+    const uint16_t t1 = a->coeffs[2 * i + 1];
+    /*
+     * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
+     * significant data, so these can be packed into 24 bits or exactly
+     * 3 bytes, as follows.
+     */
+
+    /* Least significant bits 0 - 7 of t0. */
+    r[3 * i + 0] = t0 & 0xFF;
+
+    /*
+     * Most significant bits 8 - 11 of t0 become the least significant
+     * nibble of the second byte. The least significant 4 bits
+     * of t1 become the upper nibble of the second byte.
+     */
+    r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+
+    /* Bits 4 - 11 of t1 become the third byte. */
+    r[3 * i + 2] = t1 >> 4;
+  }
+}
+#else  /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  poly_tobytes_native(r, a->coeffs);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
+  {
+    const uint8_t t0 = a[3 * i + 0];
+    const uint8_t t1 = a[3 * i + 1];
+    const uint8_t t2 = a[3 * i + 2];
+    r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
+    r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
+  }
+
+  /* Note that the coefficients are not canonical */
+  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
+}
+#else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+  poly_frombytes_native(r->coeffs, a);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+{
+  unsigned i;
+#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
+#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
+#endif
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
+  {
+    unsigned j;
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <  MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
+    {
+      /* Prevent the compiler from recognizing this as a bit selection */
+      uint8_t mask = value_barrier_u8(1u << j);
+      r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
+    }
+  }
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    msg[i] = 0;
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8))
+    {
+      uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
+      msg[i] |= t << j;
+    }
+  }
+}
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_compress MLKEM_NAMESPACE_K(empty_cu_compress)
+int empty_cu_compress;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/compress.h
new file mode 100644
index 000000000..409dbe519
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/compress.h
@@ -0,0 +1,495 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef COMPRESS_H
+#define COMPRESS_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "cbmc.h"
+#include "common.h"
+#include "debug.h"
+#include "poly.h"
+#include "verify.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1)
+#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4)
+#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5)
+#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10)
+#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11)
+#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4)
+#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5)
+#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10)
+#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11)
+/* End of static namespacing */
+
+/************************************************************
+ * Name: scalar_compress_d1
+ *
+ * Description: Computes round(u * 2 / q) % 2
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 1.
+ *
+ * Arguments: - u: Unsigned canonical representative modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The unsigned 32-bit arithmetic in this routine wraps around
+ * for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d1(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 2)
+  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
+{
+  uint32_t d0 = u << 1;
+  d0 *= 645083;   /* floor(2^31 / MLKEM_Q), assuming MLKEM_Q = 3329 */
+  d0 += 1u << 30; /* rounding offset: half of 2^31 */
+  d0 >>= 31;      /* only the top bit survives, i.e. the result mod 2 */
+  return d0;
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_compress_d4
+ *
+ * Description: Computes round(u * 16 / q) % 16
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical representative modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The unsigned 32-bit arithmetic in this routine wraps around
+ * for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d4(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 16)
+  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
+{
+  uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */
+  return (d0 + (1u << 27)) >> 28;      /* round(d0/2^28) */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d4
+ *
+ * Description: Computes round(u * q / 16)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical representative modulo 16
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d4(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 16)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 8) / 16; } /* +8 = 16/2 rounds to nearest */
+
+/************************************************************
+ * Name: scalar_compress_d5
+ *
+ * Description: Computes round(u * 32 / q) % 32
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical representative modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The unsigned 32-bit arithmetic in this routine wraps around
+ * for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d5(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 32)
+  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
+{
+  uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */
+  return (d0 + (1u << 26)) >> 27;      /* round(d0/2^27) */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d5
+ *
+ * Description: Computes round(u * q / 32)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical representative modulo 32
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d5(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 32)
+  ensures(return_value <= MLKEM_Q - 1)
+) { return ((u * MLKEM_Q) + 16) / 32; } /* +16 = 32/2 rounds to nearest */
+
+/************************************************************
+ * Name: scalar_compress_d10
+ *
+ * Description: Computes round(u * 2**10 / q) % 2**10
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical representative modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The product below is formed in 64 bits and cannot overflow for any
+ * uint16_t u; the reduction modulo 2^10 is done by the final mask.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d10(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 10))
+  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
+{
+  uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^33 / MLKEM_Q) */
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33; /* round(d0/2^33) */
+  return (d0 & 0x3FF);                    /* reduce modulo 2^10 */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d10
+ *
+ * Description: Computes round(u * q / 1024)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical representative modulo 1024
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d10(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 1024)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 512) / 1024; } /* +512 rounds to nearest */
+
+/************************************************************
+ * Name: scalar_compress_d11
+ *
+ * Description: Computes round(u * 2**11 / q) % 2**11
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical representative modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The product below is formed in 64 bits and cannot overflow for any
+ * uint16_t u; the reduction modulo 2^11 is done by the final mask.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d11(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 11))
+  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
+{
+  uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33; /* round(d0/2^33) */
+  return (d0 & 0x7FF);                    /* reduce modulo 2^11 */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d11
+ *
+ * Description: Computes round(u * q / 2048)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical representative modulo 2048
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d11(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 2048)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 1024) / 2048; } /* +1024 rounds to nearest */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
+    (MLKEM_K == 2 || MLKEM_K == 3)
+#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
+/*************************************************
+ * Name:        poly_compress_d4
+ *
+ * Description: Compression (4 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
+
+#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
+/*************************************************
+ * Name:        poly_compress_d10
+ *
+ * Description: Compression (10 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
+
+#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
+/*************************************************
+ * Name:        poly_decompress_d4
+ *
+ * Description: De-serialization and subsequent decompression (4 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d4
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
+
+#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
+/*************************************************
+ * Name:        poly_decompress_d10
+ *
+ * Description: De-serialization and subsequent decompression (10 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d10
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
+/*************************************************
+ * Name:        poly_compress_d5
+ *
+ * Description: Compression (5 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
+
+#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
+/*************************************************
+ * Name:        poly_compress_d11
+ *
+ * Description: Compression (11 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
+
+#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
+/*************************************************
+ * Name:        poly_decompress_d5
+ *
+ * Description: De-serialization and subsequent decompression (5 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d5
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+
+#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
+/*************************************************
+ * Name:        poly_decompress_d11
+ *
+ * Description: De-serialization and subsequent decompression (11 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d11
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
+        */
+
+#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
+/*************************************************
+ * Name:        poly_tobytes
+ *
+ * Description: Serialization of a polynomial.
+ *              Coefficients are expected to already be in
+ *              unsigned canonical form (see Arguments).
+ *
+ * Arguments:   INPUT:
+ *              - a: const pointer to input polynomial,
+ *                with each coefficient in the range [0,1,..,Q-1]
+ *              OUTPUT
+ *              - r: pointer to output byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYBYTES))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r))
+);
+
+
+#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes)
+/*************************************************
+ * Name:        poly_frombytes
+ *
+ * Description: De-serialization of a polynomial.
+ *
+ * Arguments:   INPUT
+ *              - a: pointer to input byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ *              OUTPUT
+ *              - r: pointer to output polynomial, with
+ *                   each coefficient unsigned and in the range
+ *                   0 .. 4095
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))
+);
+
+
+#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg)
+/*************************************************
+ * Name:        poly_frommsg
+ *
+ * Description: Convert 32-byte message to polynomial
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *msg: pointer to input message
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+);
+
+#define poly_tomsg MLKEM_NAMESPACE(poly_tomsg)
+/*************************************************
+ * Name:        poly_tomsg
+ *
+ * Description: Convert polynomial to 32-byte message
+ *
+ * Arguments:   - uint8_t *msg: pointer to output message
+ *              - const poly *r: pointer to input polynomial
+ *                Coefficients must be unsigned canonical
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r)
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(msg))
+);
+
+#endif /* COMPRESS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/config.h
index fa89370ce..e975ede95 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/config.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/config.h
@@ -122,46 +122,87 @@
 /* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 /******************************************************************************
- * Name:        MLKEM_USE_NATIVE
+ * Name:        MLKEM_USE_NATIVE_BACKEND_ARITH
  *
- * Description: Determines whether a native backend should
- *              be used, if available.
+ * Description: Determines whether a native arithmetic backend should be used.
+ *
+ *              The arithmetic backend covers performance critical functions
+ *              such as the number-theoretic transform (NTT).
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the arithmetic backend to be used is
+ *              determined by MLKEM_NATIVE_ARITH_BACKEND_FILE: If the latter is
+ *              unset, the default backend for the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
  *
  *              This can also be set using CFLAGS.
  *
  *****************************************************************************/
-#if !defined(MLKEM_USE_NATIVE)
-/* #define MLKEM_USE_NATIVE */
+#if !defined(MLKEM_USE_NATIVE_BACKEND_ARITH)
+/* #define MLKEM_USE_NATIVE_BACKEND_ARITH */
 #endif
 
 /******************************************************************************
- * Name:        MLKEM_NATIVE_ARITH_BACKEND
+ * Name:        MLKEM_NATIVE_ARITH_BACKEND_FILE
  *
  * Description: The arithmetic backend to use.
  *
- *              This must be the filename of an arithmetic backend.
- *              See the existing backends for examples.
+ *              If MLKEM_USE_NATIVE_BACKEND_ARITH is unset, this option
+ *              is ignored.
+ *
+ *              If MLKEM_USE_NATIVE_BACKEND_ARITH is set, this option must
+ *              either be undefined or the filename of an arithmetic backend.
+ *              If unset, the default backend will be used.
  *
  *              This can be set using CFLAGS.
  *
  *****************************************************************************/
-#if defined(MLKEM_USE_NATIVE) && !defined(MLKEM_NATIVE_ARITH_BACKEND)
-#define MLKEM_NATIVE_ARITH_BACKEND "default.h"
-#endif /* MLKEM_NATIVE_ARITH_BACKEND */
+#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) && \
+    !defined(MLKEM_NATIVE_ARITH_BACKEND_FILE)
+#define MLKEM_NATIVE_ARITH_BACKEND_FILE "native/default.h"
+#endif
 
 /******************************************************************************
- * Name:        MLKEM_NATIVE_FIPS202_BACKEND
+ * Name:        MLKEM_USE_NATIVE_BACKEND_FIPS202
+ *
+ * Description: Determines whether a native FIPS202 backend should be used.
+ *
+ *              The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is
+ *              the performance bottleneck of SHA3 and SHAKE.
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the FIPS202 backend to be used is
+ *              determined by MLKEM_NATIVE_FIPS202_BACKEND_FILE: If the latter
+ *              is unset, the default backend for the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if !defined(MLKEM_USE_NATIVE_BACKEND_FIPS202)
+/* #define MLKEM_USE_NATIVE_BACKEND_FIPS202 */
+#endif
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_FIPS202_BACKEND_FILE
  *
  * Description: The FIPS-202 backend to use.
  *
- *              This must be the filename of an FIPS-202 backend.
+ *              If MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, this option must
+ *              either be undefined or the filename of a FIPS202 backend.
+ *              If unset, the default backend will be used.
  *
  *              This can be set using CFLAGS.
  *
  *****************************************************************************/
-#if defined(MLKEM_USE_NATIVE_FIPS202) && !defined(MLKEM_NATIVE_FIPS202_BACKEND)
-#define MLKEM_NATIVE_FIPS202_BACKEND "native/default.h"
-#endif /* MLKEM_NATIVE_FIPS202_BACKEND */
+#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) && \
+    !defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE)
+#define MLKEM_NATIVE_FIPS202_BACKEND_FILE "fips202/native/default.h"
+#endif
 
 /*************************  Config internals  ********************************/
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.c
index 0cfcc3e9e..318d0fc77 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.c
@@ -9,11 +9,10 @@
 #include "fips202.h"
 #include "fips202x4.h"
 #include "indcpa.h"
-#include "ntt.h"
 #include "poly.h"
-#include "polyvec.h"
+#include "poly_k.h"
 #include "randombytes.h"
-#include "rej_uniform.h"
+#include "sampling.h"
 #include "symmetric.h"
 
 #include "arith_backend.h"
@@ -149,14 +148,14 @@ static void unpack_ciphertext(polyvec *b, poly *v,
 #define poly_permute_bitrev_to_custom \
   MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom)
 
-static INLINE void poly_permute_bitrev_to_custom(poly *data)
+static INLINE void poly_permute_bitrev_to_custom(int16_t data[MLKEM_N])
 __contract__(
   /* We don't specify that this should be a permutation, but only
    * that it does not change the bound established at the end of gen_matrix. */
-  requires(memory_no_alias(data, sizeof(poly)))
-  requires(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N))
+  requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))
   assigns(memory_slice(data, sizeof(poly)))
-  ensures(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
+  ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
 #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
 
 /* Not static for benchmarking */
@@ -245,7 +244,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
   {
     for (j = 0; j < MLKEM_K; j++)
     {
-      poly_permute_bitrev_to_custom(&a[i].vec[j]);
+      poly_permute_bitrev_to_custom(a[i].vec[j].coeffs);
     }
   }
 }
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.h
index 2c4fda3c4..b4d5985bf 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/indcpa.h
@@ -8,7 +8,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "polyvec.h"
+#include "poly_k.h"
 
 #define gen_matrix MLKEM_NAMESPACE_K(gen_matrix)
 /*************************************************
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/README.md b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/README.md
similarity index 100%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/README.md
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/README.md
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/clean.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/clean.h
similarity index 90%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/clean.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/clean.h
index 43a401dfc..f124702a4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/clean.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/clean.h
@@ -19,6 +19,6 @@
 /* Filename of the C backend implementation.
  * This is not inlined here because this header is included in assembly
  * files as well. */
-#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "aarch64/src/clean_impl.h"
+#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/aarch64/src/clean_impl.h"
 
 #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/opt.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/opt.h
similarity index 91%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/opt.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/opt.h
index 04323c3e7..a7217163f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/opt.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/opt.h
@@ -19,6 +19,6 @@
 /* Filename of the C backend implementation.
  * This is not inlined here because this header is included in assembly
  * files as well. */
-#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "aarch64/src/opt_impl.h"
+#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/aarch64/src/opt_impl.h"
 
 #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/aarch64_zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/aarch64_zetas.c
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/aarch64_zetas.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/aarch64_zetas.c
index 1e189fd99..b3a6f198f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/aarch64_zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/aarch64_zetas.c
@@ -8,7 +8,7 @@
  *          Do not modify it directly.
  */
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \
     defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/arith_native_aarch64.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/arith_native_aarch64.h
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/arith_native_aarch64.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/arith_native_aarch64.h
index fc4e7dd38..a784a3027 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/arith_native_aarch64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/arith_native_aarch64.h
@@ -6,7 +6,7 @@
 #define MLKEM_AARCH64_NATIVE_H
 
 #include <stdint.h>
-#include "common.h"
+#include "../../../common.h"
 
 #define aarch64_ntt_zetas_layer01234 \
   MLKEM_NAMESPACE(aarch64_ntt_zetas_layer01234)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/clean_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/clean_impl.h
similarity index 58%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/clean_impl.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/clean_impl.h
index 548b1eebb..ded7d067a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/clean_impl.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/clean_impl.h
@@ -12,9 +12,6 @@
 
 #include "arith_native_aarch64.h"
 
-#include "poly.h"
-#include "polyvec.h"
-
 /* Set of primitives that this backend replaces */
 #define MLKEM_USE_NATIVE_NTT
 #define MLKEM_USE_NATIVE_INTT
@@ -25,45 +22,46 @@
 #define MLKEM_USE_NATIVE_POLY_TOBYTES
 #define MLKEM_USE_NATIVE_REJ_UNIFORM
 
-static INLINE void ntt_native(poly *data)
+static INLINE void ntt_native(int16_t data[MLKEM_N])
 {
-  ntt_asm_clean(data->coeffs, aarch64_ntt_zetas_layer01234,
-                aarch64_ntt_zetas_layer56);
+  ntt_asm_clean(data, aarch64_ntt_zetas_layer01234, aarch64_ntt_zetas_layer56);
 }
 
-static INLINE void intt_native(poly *data)
+static INLINE void intt_native(int16_t data[MLKEM_N])
 {
-  intt_asm_clean(data->coeffs, aarch64_invntt_zetas_layer01234,
+  intt_asm_clean(data, aarch64_invntt_zetas_layer01234,
                  aarch64_invntt_zetas_layer56);
 }
 
-static INLINE void poly_reduce_native(poly *data)
+static INLINE void poly_reduce_native(int16_t data[MLKEM_N])
 {
-  poly_reduce_asm_clean(data->coeffs);
+  poly_reduce_asm_clean(data);
 }
-static INLINE void poly_tomont_native(poly *data)
+
+static INLINE void poly_tomont_native(int16_t data[MLKEM_N])
 {
-  poly_tomont_asm_clean(data->coeffs);
+  poly_tomont_asm_clean(data);
 }
 
-static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const poly *y)
+static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2],
+                                                const int16_t y[MLKEM_N])
 {
-  poly_mulcache_compute_asm_clean(x->coeffs, y->coeffs,
-                                  aarch64_zetas_mulcache_native,
+  poly_mulcache_compute_asm_clean(x, y, aarch64_zetas_mulcache_native,
                                   aarch64_zetas_mulcache_twisted_native);
 }
+
 static INLINE void polyvec_basemul_acc_montgomery_cached_native(
-    poly *r, const polyvec *a, const polyvec *b,
-    const polyvec_mulcache *b_cache)
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)])
 {
-  polyvec_basemul_acc_montgomery_cached_asm_clean(
-      r->coeffs, a->vec[0].coeffs, b->vec[0].coeffs, b_cache->vec[0].coeffs);
+  polyvec_basemul_acc_montgomery_cached_asm_clean(r, a, b, b_cache);
 }
 
 static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
-                                       const poly *a)
+                                       const int16_t a[MLKEM_N])
 {
-  poly_tobytes_asm_clean(r, a->coeffs);
+  poly_tobytes_asm_clean(r, a);
 }
 
 static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/consts.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/consts.h
similarity index 94%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/consts.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/consts.h
index c40947299..e3ea26a27 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/consts.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/consts.h
@@ -7,7 +7,7 @@
 #define MLKEM_NATIVE_AARCH64_CONSTS
 
 #include <stdint.h>
-#include "common.h"
+#include "../../../common.h"
 
 #define zetas_mulcache_native MLKEM_NAMESPACE(zetas_mulcache_native)
 extern const int16_t zetas_mulcache_native[256];
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_clean.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_clean.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_clean.S
index b243a569d..28ad38975 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_clean.S
@@ -23,7 +23,7 @@
 /// SOFTWARE.
 ///
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
 
 // Bounds:
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_opt.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_opt.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_opt.S
index c94746e17..857c729cb 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_opt.S
@@ -23,7 +23,7 @@
 /// SOFTWARE.
 ///
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
 // Bounds:
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_clean.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_clean.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_clean.S
index cd63cc4d6..30fdc76b0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_clean.S
@@ -24,7 +24,7 @@
 /// SOFTWARE.
 ///
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
 
 // Bounds:
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_opt.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_opt.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_opt.S
index 8705615b7..431f9dc6f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_opt.S
@@ -24,7 +24,7 @@
 /// SOFTWARE.
 ///
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
 // Bounds:
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/opt_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/opt_impl.h
similarity index 58%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/opt_impl.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/opt_impl.h
index ec1bf6587..eb8e39ed0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/opt_impl.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/opt_impl.h
@@ -10,11 +10,9 @@
 #else
 #define MLKEM_NATIVE_ARITH_PROFILE_IMPL_H
 
+#include "../../../params.h"
 #include "arith_native_aarch64.h"
 
-#include "poly.h"
-#include "polyvec.h"
-
 /* Set of primitives that this backend replaces */
 #define MLKEM_USE_NATIVE_NTT
 #define MLKEM_USE_NATIVE_INTT
@@ -25,45 +23,46 @@
 #define MLKEM_USE_NATIVE_POLY_TOBYTES
 #define MLKEM_USE_NATIVE_REJ_UNIFORM
 
-static INLINE void ntt_native(poly *data)
+static INLINE void ntt_native(int16_t data[MLKEM_N])
 {
-  ntt_asm_opt(data->coeffs, aarch64_ntt_zetas_layer01234,
-              aarch64_ntt_zetas_layer56);
+  ntt_asm_opt(data, aarch64_ntt_zetas_layer01234, aarch64_ntt_zetas_layer56);
 }
 
-static INLINE void intt_native(poly *data)
+static INLINE void intt_native(int16_t data[MLKEM_N])
 {
-  intt_asm_opt(data->coeffs, aarch64_invntt_zetas_layer01234,
+  intt_asm_opt(data, aarch64_invntt_zetas_layer01234,
                aarch64_invntt_zetas_layer56);
 }
 
-static INLINE void poly_reduce_native(poly *data)
+static INLINE void poly_reduce_native(int16_t data[MLKEM_N])
 {
-  poly_reduce_asm_opt(data->coeffs);
+  poly_reduce_asm_opt(data);
 }
-static INLINE void poly_tomont_native(poly *data)
+
+static INLINE void poly_tomont_native(int16_t data[MLKEM_N])
 {
-  poly_tomont_asm_opt(data->coeffs);
+  poly_tomont_asm_opt(data);
 }
 
-static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const poly *y)
+static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2],
+                                                const int16_t y[MLKEM_N])
 {
-  poly_mulcache_compute_asm_opt(x->coeffs, y->coeffs,
-                                aarch64_zetas_mulcache_native,
+  poly_mulcache_compute_asm_opt(x, y, aarch64_zetas_mulcache_native,
                                 aarch64_zetas_mulcache_twisted_native);
 }
+
 static INLINE void polyvec_basemul_acc_montgomery_cached_native(
-    poly *r, const polyvec *a, const polyvec *b,
-    const polyvec_mulcache *b_cache)
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)])
 {
-  polyvec_basemul_acc_montgomery_cached_asm_opt(
-      r->coeffs, a->vec[0].coeffs, b->vec[0].coeffs, b_cache->vec[0].coeffs);
+  polyvec_basemul_acc_montgomery_cached_asm_opt(r, a, b, b_cache);
 }
 
 static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
-                                       const poly *a)
+                                       const int16_t a[MLKEM_N])
 {
-  poly_tobytes_asm_opt(r, a->coeffs);
+  poly_tobytes_asm_opt(r, a);
 }
 
 static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/optimize.sh b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/optimize.sh
similarity index 100%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/optimize.sh
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/optimize.sh
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_clean.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_clean.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_clean.S
index 809f9667e..f3ee0796f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_clean.S
@@ -3,7 +3,7 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
 
 /*
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_opt.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_opt.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_opt.S
index 815a9dd1a..555c60a67 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_opt.S
@@ -3,7 +3,7 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
 /*
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_clean.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_clean.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_clean.S
index c91675b44..0b6df6345 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_clean.S
@@ -9,7 +9,7 @@
 // https://eprint.iacr.org/2021/986
 // https://github.com/neon-ntt/neon-ntt
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
 
 // Input:
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_opt.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_opt.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_opt.S
index 8300b682c..7a27fda3e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_opt.S
@@ -9,7 +9,7 @@
 // https://eprint.iacr.org/2021/986
 // https://github.com/neon-ntt/neon-ntt
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
 // Input:
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_asm_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_asm_clean.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_asm_clean.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_asm_clean.S
index 5151a05d0..9158d6c82 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_asm_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_asm_clean.S
@@ -18,7 +18,7 @@
  *
  * Returns number of sampled 16-bit integers (at most MLKEM_N).
  **************************************************/
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \
     defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_table.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_table.c
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_table.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_table.c
index 507660349..29cdbe95f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_table.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_table.c
@@ -8,7 +8,7 @@
  *          Do not modify it directly.
  */
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \
     defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/api.h
similarity index 90%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/api.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/api.h
index 792ecb8a4..0704f9dcd 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/api.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/api.h
@@ -23,8 +23,7 @@
 #define MLKEM_NATIVE_ARITH_NATIVE_API_H
 
 #include <stdint.h>
-#include "poly.h"
-#include "polyvec.h"
+#include "../common.h"
 
 /*
  * This is the C<->native interface allowing for the drop-in of
@@ -65,9 +64,9 @@
  *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
  *              for more information.
  *
- * Arguments:   - poly *p: pointer to in/output polynomial
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
  **************************************************/
-static INLINE void ntt_native(poly *);
+static INLINE void ntt_native(int16_t p[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_NTT */
 
 #if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
@@ -96,10 +95,10 @@ and to/from bytes conversions."
  *
  *              This must only be defined if there is native code for
  *              all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache.
- * Arguments:   - poly *p: pointer to in/output polynomial
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
  *
  **************************************************/
-static INLINE void poly_permute_bitrev_to_custom(poly *);
+static INLINE void poly_permute_bitrev_to_custom(int16_t p[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
 
 #if defined(MLKEM_USE_NATIVE_INTT)
@@ -117,7 +116,7 @@ static INLINE void poly_permute_bitrev_to_custom(poly *);
  *
  * Arguments:   - uint16_t *a: pointer to in/output polynomial
  **************************************************/
-static INLINE void intt_native(poly *);
+static INLINE void intt_native(int16_t p[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_INTT */
 
 #if defined(MLKEM_USE_NATIVE_POLY_REDUCE)
@@ -126,9 +125,9 @@ static INLINE void intt_native(poly *);
  *
  * Description: Applies modular reduction to all coefficients of a polynomial.
  *
- * Arguments:   - poly *r: pointer to input/output polynomial
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to input/output polynomial
  **************************************************/
-static INLINE void poly_reduce_native(poly *);
+static INLINE void poly_reduce_native(int16_t p[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
 
 #if defined(MLKEM_USE_NATIVE_POLY_TOMONT)
@@ -138,9 +137,9 @@ static INLINE void poly_reduce_native(poly *);
  * Description: Inplace conversion of all coefficients of a polynomial
  *              from normal domain to Montgomery domain
  *
- * Arguments:   - poly *r: pointer to input/output polynomial
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to input/output polynomial
  **************************************************/
-static INLINE void poly_tomont_native(poly *);
+static INLINE void poly_tomont_native(int16_t p[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
 
 #if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE)
@@ -165,8 +164,8 @@ static INLINE void poly_tomont_native(poly *);
  *              OUTPUT
  *              - cache: pointer to multiplication cache
  **************************************************/
-static INLINE void poly_mulcache_compute_native(poly_mulcache *cache,
-                                                const poly *poly);
+static INLINE void poly_mulcache_compute_native(int16_t cache[MLKEM_N / 2],
+                                                const int16_t poly[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 
 #if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
@@ -189,8 +188,9 @@ static INLINE void poly_mulcache_compute_native(poly_mulcache *cache,
  *                   in NTT domain, and of the same order as a and b.
  **************************************************/
 static INLINE void polyvec_basemul_acc_montgomery_cached_native(
-    poly *r, const polyvec *a, const polyvec *b,
-    const polyvec_mulcache *b_cache);
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]);
 #endif
 
 #if defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
@@ -209,7 +209,7 @@ static INLINE void polyvec_basemul_acc_montgomery_cached_native(
  *                   (of MLKEM_POLYBYTES bytes)
  **************************************************/
 static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
-                                       const poly *a);
+                                       const int16_t a[MLKEM_N]);
 #endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
 
 #if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
@@ -226,7 +226,7 @@ static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
  *              - a: const pointer to input byte aray
  *                   (of MLKEM_POLYBYTES bytes)
  **************************************************/
-static INLINE void poly_frombytes_native(poly *a,
+static INLINE void poly_frombytes_native(int16_t a[MLKEM_N],
                                          const uint8_t r[MLKEM_POLYBYTES]);
 #endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/default.h
similarity index 97%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/default.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/default.h
index d1e41c52e..f9fe4310a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/default.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/default.h
@@ -8,7 +8,7 @@
 /*
  * Default arithmetic backend
  */
-#include "sys.h"
+#include "../sys.h"
 
 #ifdef SYS_AARCH64
 /*
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/ntt.c
deleted file mode 100644
index 3651c8da9..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/ntt.c
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-
-#include <stdint.h>
-#include "arith_backend.h"
-#include "debug.h"
-#include "ntt.h"
-#include "reduce.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block)
-#define ntt_layer MLKEM_NAMESPACE(ntt_layer)
-#define invntt_layer MLKEM_NAMESPACE(invntt_layer)
-/* End of static namespacing */
-
-#if !defined(MLKEM_USE_NATIVE_NTT)
-/*
- * Computes a block CT butterflies with a fixed twiddle factor,
- * using Montgomery multiplication.
- * Parameters:
- * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
- * - root: Twiddle factor to use for the butterfly. This must be in
- *         Montgomery form and signed canonical.
- * - start: Offset to the beginning of the butterfly block
- * - len: Index difference between coefficients subject to a butterfly
- * - bound: Ghost variable describing coefficient bound: Prior to `start`,
- *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
- *          they must be bound by `bound`.
- * When this function returns, output coefficients in the index range
- * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
- * Example:
- * - start=8, len=4
- *   This would compute the following four butterflies
- *          8     --    12
- *             9    --     13
- *                10   --     14
- *                   11   --     15
- * - start=4, len=2
- *   This would compute the following two butterflies
- *          4 -- 6
- *             5 -- 7
- */
-static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
-                                unsigned start, unsigned len, int bound)
-__contract__(
-  requires(start < MLKEM_N)
-  requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
-  requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
-  requires(-HALF_Q < zeta && zeta < HALF_Q)
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(array_abs_bound(r, 0, start, bound + MLKEM_Q))
-  requires(array_abs_bound(r, start, MLKEM_N, bound))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q))
-  ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
-{
-  /* `bound` is a ghost variable only needed in the CBMC specification */
-  unsigned j;
-  ((void)bound);
-  for (j = start; j < start + len; j++)
-  __loop__(
-    invariant(start <= j && j <= start + len)
-    /*
-     * Coefficients are updated in strided pairs, so the bounds for the
-     * intermediate states alternate twice between the old and new bound
-     */
-    invariant(array_abs_bound(r, 0,           j,           bound + MLKEM_Q))
-    invariant(array_abs_bound(r, j,           start + len, bound))
-    invariant(array_abs_bound(r, start + len, j + len,     bound + MLKEM_Q))
-    invariant(array_abs_bound(r, j + len,     MLKEM_N,     bound)))
-  {
-    int16_t t;
-    t = fqmul(r[j + len], zeta);
-    r[j + len] = r[j] - t;
-    r[j] = r[j] + t;
-  }
-}
-
-/*
- *Compute one layer of forward NTT
- * Parameters:
- * - r: Pointer to base of polynomial
- * - len: Stride of butterflies in this layer.
- * - layer: Ghost variable indicating which layer is being applied.
- *          Must match `len` via `len == MLKEM_N >> layer`.
- * Note: `len` could be dropped and computed in the function, but
- *   we are following the structure of the reference NTT from the
- *   official Kyber implementation here, merely adding `layer` as
- *   a ghost variable for the specifications.
- */
-static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
-__contract__(
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
-  requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
-{
-  unsigned start, k;
-  /* `layer` is a ghost variable only needed in the CBMC specification */
-  ((void)layer);
-  /* Twiddle factors for layer n start at index 2^(layer-1) */
-  k = MLKEM_N / (2 * len);
-  for (start = 0; start < MLKEM_N; start += 2 * len)
-  __loop__(
-    invariant(start < MLKEM_N + 2 * len)
-    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
-    invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
-    invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
-  {
-    int16_t zeta = zetas[k++];
-    ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
-  }
-}
-
-/*
- * Compute full forward NTT
- * NOTE: This particular implementation satisfies a much tighter
- * bound on the output coefficients (5*q) than the contractual one (8*q),
- * but this is not needed in the calling code. Should we change the
- * base multiplication strategy to require smaller NTT output bounds,
- * the proof may need strengthening.
- */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *p)
-{
-  unsigned len, layer;
-  int16_t *r;
-  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
-  r = p->coeffs;
-
-  for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
-  __loop__(
-    invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer))
-    invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)))
-  {
-    ntt_layer(r, len, layer);
-  }
-
-  /* Check the stronger bound */
-  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
-}
-#else  /* MLKEM_USE_NATIVE_NTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *p)
-{
-  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
-  ntt_native(p);
-  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
-}
-#endif /* MLKEM_USE_NATIVE_NTT */
-
-#if !defined(MLKEM_USE_NATIVE_INTT)
-
-/* Compute one layer of inverse NTT */
-static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
-__contract__(
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
-  requires(len == (1 << (8 - layer)))
-  requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-{
-  unsigned start, k;
-  /* `layer` is a ghost variable used only in the specification */
-  ((void)layer);
-  k = MLKEM_N / len - 1;
-  for (start = 0; start < MLKEM_N; start += 2 * len)
-  __loop__(
-    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-    invariant(start <= MLKEM_N && k <= 127)
-    /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
-    invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
-  {
-    unsigned j;
-    int16_t zeta = zetas[k--];
-    for (j = start; j < start + len; j++)
-    __loop__(
-      invariant(start <= j && j <= start + len)
-      invariant(start <= MLKEM_N && k <= 127)
-      invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-    {
-      int16_t t = r[j];
-      r[j] = barrett_reduce(t + r[j + len]);
-      r[j + len] = r[j + len] - t;
-      r[j + len] = fqmul(r[j + len], zeta);
-    }
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *p)
-{
-  /*
-   * Scale input polynomial to account for Montgomery factor
-   * and NTT twist. This also brings coefficients down to
-   * absolute value < MLKEM_Q.
-   */
-  unsigned j, len, layer;
-  const int16_t f = 1441;
-  int16_t *r = p->coeffs;
-
-  for (j = 0; j < MLKEM_N; j++)
-  __loop__(
-    invariant(j <= MLKEM_N)
-    invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
-  {
-    r[j] = fqmul(r[j], f);
-  }
-
-  /* Run the invNTT layers */
-  for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
-  __loop__(
-    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
-    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-  {
-    invntt_layer(p->coeffs, len, layer);
-  }
-
-  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
-}
-#else  /* MLKEM_USE_NATIVE_INTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *p)
-{
-  intt_native(p);
-  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
-}
-#endif /* MLKEM_USE_NATIVE_INTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
-                    int16_t b_cached)
-{
-  int32_t t0, t1;
-  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
-
-  t0 = (int32_t)a[1] * b_cached;
-  t0 += (int32_t)a[0] * b[0];
-  t1 = (int32_t)a[0] * b[1];
-  t1 += (int32_t)a[1] * b[0];
-
-  /* |ti| < 2 * q * 2^15 */
-  r[0] = montgomery_reduce(t0);
-  r[1] = montgomery_reduce(t1);
-
-  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
-}
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt)
-int empty_cu_ntt;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/ntt.h
deleted file mode 100644
index 4e80d3ab3..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/ntt.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef NTT_H
-#define NTT_H
-#include "common.h"
-
-#include <stdint.h>
-#include "cbmc.h"
-#include "poly.h"
-#include "reduce.h"
-
-#define zetas MLKEM_NAMESPACE(zetas)
-extern const int16_t zetas[128];
-
-#define poly_ntt MLKEM_NAMESPACE(poly_ntt)
-/*************************************************
- * Name:        poly_ntt
- *
- * Description: Computes negacyclic number-theoretic transform (NTT) of
- *              a polynomial in place.
- *
- *              The input is assumed to be in normal order and
- *              coefficient-wise bound by MLKEM_Q in absolute value.
- *
- *              The output polynomial is in bitreversed order, and
- *              coefficient-wise bound by NTT_BOUND in absolute value.
- *
- *              (NOTE: Sometimes the input to the NTT is actually smaller,
- *               which gives better bounds.)
- *
- * Arguments:   - poly *p: pointer to in/output polynomial
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND))
-);
-
-#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont)
-/*************************************************
- * Name:        poly_invntt_tomont
- *
- * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
- *              of a polynomial in place;
- *              inputs assumed to be in bitreversed order, output in normal
- *              order
- *
- *              The input is assumed to be in bitreversed order, and can
- *              have arbitrary coefficients in int16_t.
- *
- *              The output polynomial is in normal order, and
- *              coefficient-wise bound by INVNTT_BOUND in absolute value.
- *
- * Arguments:   - uint16_t *a: pointer to in/output polynomial
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND))
-);
-
-#define basemul_cached MLKEM_NAMESPACE(basemul_cached)
-/************************************************************
- * Name: basemul_cached
- *
- * Description: Computes a representative modulo q of
- *              (a0*b0 + a1*b_cached, a0*b1 + a1*b0)/65536
- *
- *              If b_cached is b1*zeta, this represents the
- *              product of (a0 + a1*X) and (b0 + b1*X) in
- *              Fq[X]/(X^2 - zeta).
- *
- * Arguments: - r: Pointer to output polynomial
- *                   Upon return, coefficients are bound by
- *                   2*MLKEM_Q in absolute value.
- *            - a: Pointer to first input polynomial
- *                   Every coefficient must be in [0..4095]
- *            - b: Pointer to second input polynomial
- *                   Can have arbitrary int16_t coefficients
- *            - b_cached: Some precomputed value, typically derived from
- *                   b1 and a twiddle factor. Can be an arbitary int16_t.
- ************************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
-                    int16_t b_cached)
-__contract__(
-  requires(memory_no_alias(r, 2 * sizeof(int16_t)))
-  requires(memory_no_alias(a, 2 * sizeof(int16_t)))
-  requires(memory_no_alias(b, 2 * sizeof(int16_t)))
-  requires(array_bound(a, 0, 2, 0, UINT12_LIMIT))
-  assigns(memory_slice(r, 2 * sizeof(int16_t)))
-  ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))
-);
-
-#endif /* NTT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/params.h
index 57ea4c8ba..7f6c12625 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/params.h
@@ -18,6 +18,7 @@
 #define MLKEM_N 256
 #define MLKEM_Q 3329
 #define UINT12_LIMIT 4096
+#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */
 
 #define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */
 #define MLKEM_SSBYTES 32  /* size in bytes of shared key */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.c
index 7483ebf6d..e8a2e2c6e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.c
@@ -8,388 +8,246 @@
 #include <stdint.h>
 #include <string.h>
 #include "arith_backend.h"
-#include "cbd.h"
 #include "cbmc.h"
 #include "debug.h"
 #include "fips202x4.h"
-#include "ntt.h"
 #include "poly.h"
-#include "reduce.h"
+#include "sampling.h"
 #include "symmetric.h"
 #include "verify.h"
 
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
-{
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 16)))
-    {
-      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
-    }
-
-    r[i * 4] = t[0] | (t[1] << 4);
-    r[i * 4 + 1] = t[2] | (t[3] << 4);
-    r[i * 4 + 2] = t[4] | (t[5] << 4);
-    r[i * 4 + 3] = t[6] | (t[7] << 4);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
-{
-  unsigned j;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-  for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(invariant(j <= MLKEM_N / 4))
-  {
-    unsigned k;
-    uint16_t t[4];
-    for (k = 0; k < 4; k++)
-    __loop__(
-      invariant(k <= 4)
-      invariant(forall(r, 0, k, t[r] < (1u << 10))))
-    {
-      t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
-    }
-
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 10-bit in size.
-     */
-    r[5 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
-    r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
-    r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
-    r[5 * j + 4] = (t[3] >> 2);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
-  {
-    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
-    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d10(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16)
+#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic)
+#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce)
+#define fqmul MLKEM_NAMESPACE(fqmul)
+#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce)
+#define basemul_cached MLKEM_NAMESPACE(basemul_cached)
+#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q)
+#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block)
+#define ntt_layer MLKEM_NAMESPACE(ntt_layer)
+#define invntt_layer MLKEM_NAMESPACE(invntt_layer)
+/* End of static namespacing */
+
+/*************************************************
+ * Name:        cast_uint16_to_int16
+ *
+ * Description: Cast uint16 value to int16
+ *
+ * Returns:
+ *   input x in     0 .. 32767: returns value unchanged
+ *   input x in 32768 .. 65535: returns (x - 65536)
+ **************************************************/
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "conversion"
+#endif
+ALWAYS_INLINE
+static INLINE int16_t cast_uint16_to_int16(uint16_t x)
 {
-  unsigned j;
-  for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(
-    invariant(j <= MLKEM_N / 4)
-    invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
-  {
-    unsigned k;
-    uint16_t t[4];
-    uint8_t const *base = &a[5 * j];
-
-    t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6));
-    t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4));
-    t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2));
-
-    for (k = 0; k < 4; k++)
-    __loop__(
-      invariant(k <= 4)
-      invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
-    }
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  /*
+   * PORTABILITY: This relies on uint16_t -> int16_t
+   * being implemented as the inverse of int16_t -> uint16_t,
+   * which is implementation-defined (C99 6.3.1.3 (3))
+   * CBMC (correctly) fails to prove this conversion is OK,
+   * so we have to suppress that check here
+   */
+  return (int16_t)x;
 }
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
-          || MLKEM_K == 3) */
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
 
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
+/*************************************************
+ * Name:        montgomery_reduce_generic
+ *
+ * Description: Generic Montgomery reduction; given a 32-bit integer a, computes
+ *              16-bit integer congruent to a * R^-1 mod q, where R=2^16
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q, with absolute value
+ *                <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2
+ *
+ **************************************************/
+ALWAYS_INLINE
+static INLINE int16_t montgomery_reduce_generic(int32_t a)
 {
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */
+  const uint32_t QINV = 62209; /* q^-1 mod 2^16 */
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 32)))
-    {
-      t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
-    }
+  /*  Compute a*q^{-1} mod 2^16 in unsigned representatives */
+  const uint16_t a_reduced = a & UINT16_MAX;
+  const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
 
-    /*
-     * Explicitly truncate to avoid warning about
-     * implicit truncation in CBMC, and use array indexing into
-     * r rather than pointer-arithmetic to simplify verification
-     */
-    r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
-    r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
-    r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
-    r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
-    r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
-  }
-}
+  /* Lift to signed canonical representative mod 2^16. */
+  const int16_t t = cast_uint16_to_int16(a_inverted);
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
-{
-  unsigned j;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  int32_t r = a - ((int32_t)t * MLKEM_Q);
+  /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */
 
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(invariant(j <= MLKEM_N / 8))
-  {
-    unsigned k;
-    uint16_t t[8];
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(k <= 8)
-      invariant(forall(r, 0, k, t[r] < (1u << 11))))
-    {
-      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
-    }
+  /*
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  r = r >> 16;
+  /* Bounds: |r >> 16| <= ceil(|r| / 2^16)
+   *                   <= ceil(|a| / 2^16 + MLKEM_Q / 2)
+   *                   <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
+   *
+   * (Note that |a >> 16| = ceil(|a| / 2^16) for negative a)
+   */
 
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 11-bit in size.
-     */
-    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
-    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
-    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
-    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
-    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
-    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
-    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
-    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
-    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
-    r[11 * j + 10] = (t[7] >> 3);
-  }
+  return (int16_t)r;
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+/*************************************************
+ * Name:        montgomery_reduce
+ *
+ * Description: Montgomery reduction
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *                  Must be smaller than 2 * 2^12 * 2^15 in absolute value.
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q,
+ *              smaller than 2 * q in absolute value.
+ **************************************************/
+static INLINE int16_t montgomery_reduce(int32_t a)
+__contract__(
+  requires(a > -(2 * UINT12_LIMIT * 32768))
+  requires(a <  (2 * UINT12_LIMIT * 32768))
+  ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
+)
 {
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
-  {
-    unsigned j;
-    uint8_t t[8];
-    const unsigned offset = i * 5;
-    /*
-     * Explicitly truncate to avoid warning about
-     * implicit truncation in CBMC and unwind loop for ease
-     * of proof.
-     */
-
-    /*
-     * Decompress 5 8-bit bytes (so 40 bits) into
-     * 8 5-bit values stored in t[]
-     */
-    t[0] = 0x1F & (a[offset + 0] >> 0);
-    t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3));
-    t[2] = 0x1F & (a[offset + 1] >> 2);
-    t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1));
-    t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4));
-    t[5] = 0x1F & (a[offset + 3] >> 1);
-    t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2));
-    t[7] = 0x1F & (a[offset + 4] >> 3);
-
-    /* and copy to the correct slice in r[] */
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(j <= 8 && i <= MLKEM_N / 8)
-      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
-    }
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  int16_t res;
+  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
+
+  res = montgomery_reduce_generic(a);
+  /* Bounds:
+   * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2
+   *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
+   *        < 2 * MLKEM_Q */
+
+  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
+  return res;
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d11(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+#if !defined(MLKEM_USE_NATIVE_POLY_TOMONT) ||           \
+    !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
+    !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        fqmul
+ *
+ * Description: Montgomery multiplication modulo q=3329
+ *
+ * Arguments:   - int16_t a: first factor
+ *                  Can be any int16_t.
+ *              - int16_t b: second factor.
+ *                  Must be signed canonical (abs value <(q+1)/2)
+ *
+ * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and
+ * smaller than q in absolute value.
+ *
+ **************************************************/
+static INLINE int16_t fqmul(int16_t a, int16_t b)
+__contract__(
+  requires(b > -HALF_Q)
+  requires(b < HALF_Q)
+  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
+)
 {
-  unsigned j;
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(
-    invariant(j <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
-  {
-    unsigned k;
-    uint16_t t[8];
-    uint8_t const *base = &a[11 * j];
-    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
-    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
-                    ((uint16_t)base[4] << 10));
-    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
-    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
-    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
-                    ((uint16_t)base[8] << 9));
-    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
-    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
-
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(k <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
-    }
-  }
+  int16_t res;
+  debug_assert_abs_bound(&b, 1, HALF_Q);
+
+  res = montgomery_reduce((int32_t)a * (int32_t)b);
+  /* Bounds:
+   * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
+   *        < MLKEM_Q
+   */
 
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  debug_assert_abs_bound(&res, 1, MLKEM_Q);
+  return res;
 }
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */
-
-#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+#endif /* !defined(MLKEM_USE_NATIVE_POLY_TOMONT) ||           \
+          !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
+          !defined(MLKEM_USE_NATIVE_NTT) ||                   \
+          !defined(MLKEM_USE_NATIVE_INTT) */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || !defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        barrett_reduce
+ *
+ * Description: Barrett reduction; given a 16-bit integer a, computes
+ *              centered representative congruent to a mod q in
+ *              {-(q-1)/2,...,(q-1)/2}
+ *
+ * Arguments:   - int16_t a: input integer to be reduced
+ *
+ * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
+ **************************************************/
+static INLINE int16_t barrett_reduce(int16_t a)
+__contract__(
+  ensures(return_value > -HALF_Q && return_value < HALF_Q)
+)
 {
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(invariant(i <= MLKEM_N / 2))
-  {
-    const uint16_t t0 = a->coeffs[2 * i];
-    const uint16_t t1 = a->coeffs[2 * i + 1];
-    /*
-     * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
-     * significant data, so these can be packed into 24 bits or exactly
-     * 3 bytes, as follows.
-     */
-
-    /* Least significant bits 0 - 7 of t0. */
-    r[3 * i + 0] = t0 & 0xFF;
-
-    /*
-     * Most significant bits 8 - 11 of t0 become the least significant
-     * nibble of the second byte. The least significant 4 bits
-     * of t1 become the upper nibble of the second byte.
-     */
-    r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+  /*
+   * To divide by MLKEM_Q using Barrett multiplication, the "magic number"
+   * multiplier is round_to_nearest(2**26/MLKEM_Q)
+   */
+  const int BPOWER = 26;
+  const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q;
 
-    /* Bits 4 - 11 of t1 become the third byte. */
-    r[3 * i + 2] = t1 >> 4;
-  }
-}
-#else  /* MLKEM_USE_NATIVE_POLY_TOBYTES */
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
-{
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-  poly_tobytes_native(r, a);
-}
-#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+  /*
+   * Compute round_to_nearest(a/MLKEM_Q) using the multiplier
+   * above and shift by BPOWER places.
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER;
 
-#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
-  {
-    const uint8_t t0 = a[3 * i + 0];
-    const uint8_t t1 = a[3 * i + 1];
-    const uint8_t t2 = a[3 * i + 2];
-    r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
-    r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
-  }
+  /*
+   * t is in -10 .. +10, so we need 32-bit math to
+   * evaluate t * MLKEM_Q and the subsequent subtraction
+   */
+  int16_t res = (int16_t)(a - t * MLKEM_Q);
 
-  /* Note that the coefficients are not canonical */
-  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
-}
-#else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-{
-  poly_frombytes_native(r, a);
+  debug_assert_abs_bound(&res, 1, HALF_Q);
+  return res;
 }
-#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+#endif /* !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || \
+          !defined(MLKEM_USE_NATIVE_INTT) */
+
+static void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
+                           int16_t b_cached)
+__contract__(
+  requires(memory_no_alias(r, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(a, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(b, 2 * sizeof(int16_t)))
+  requires(array_bound(a, 0, 2, 0, UINT12_LIMIT))
+  assigns(memory_slice(r, 2 * sizeof(int16_t)))
+  ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q)))
 {
-  unsigned i;
-#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
-#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
-#endif
+  int32_t t0, t1;
+  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
-  {
-    unsigned j;
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <  MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
-    {
-      /* Prevent the compiler from recognizing this as a bit selection */
-      uint8_t mask = value_barrier_u8(1u << j);
-      r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
-    }
-  }
-  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
-}
+  t0 = (int32_t)a[1] * b_cached;
+  t0 += (int32_t)a[0] * b[0];
+  t1 = (int32_t)a[0] * b[1];
+  t1 += (int32_t)a[1] * b[0];
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
-{
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  /* |ti| < 2 * UINT12_LIMIT * 2^15 */
+  r[0] = montgomery_reduce(t0);
+  r[1] = montgomery_reduce(t1);
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    msg[i] = 0;
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8))
-    {
-      uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
-      msg[i] |= t << j;
-    }
-  }
+  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -434,12 +292,46 @@ void poly_tomont(poly *r)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomont(poly *r)
 {
-  poly_tomont_native(r);
+  poly_tomont_native(r->coeffs);
   debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
 
 #if !defined(MLKEM_USE_NATIVE_POLY_REDUCE)
+/************************************************************
+ * Name: scalar_signed_to_unsigned_q
+ *
+ * Description: converts signed polynomial coefficient
+ *              from signed (-3328 .. 3328) form to
+ *              unsigned form (0 .. 3328).
+ *
+ * Note: Cryptographic constant time implementation
+ *
+ * Examples:       0 -> 0
+ *                 1 -> 1
+ *              3328 -> 3328
+ *                -1 -> 3328
+ *                -2 -> 3327
+ *             -3328 -> 1
+ *
+ * Arguments: c: signed coefficient to be converted
+ ************************************************************/
+static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
+__contract__(
+  requires(c > -MLKEM_Q && c < MLKEM_Q)
+  ensures(return_value >= 0 && return_value < MLKEM_Q)
+  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
+{
+  debug_assert_abs_bound(&c, 1, MLKEM_Q);
+
+  /* Add Q if c is negative, but in constant time */
+  c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
+
+  /* and therefore cast to uint16_t is safe. */
+  debug_assert_bound(&c, 1, 0, MLKEM_Q);
+  return (uint16_t)c;
+}
+
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
@@ -461,7 +353,7 @@ void poly_reduce(poly *r)
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
-  poly_reduce_native(r);
+  poly_reduce_native(r->coeffs);
   debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
@@ -520,13 +412,232 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 MLKEM_NATIVE_INTERNAL_API
 void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
-  poly_mulcache_compute_native(x, a);
+  poly_mulcache_compute_native(x->coeffs, a->coeffs);
   /* Omitting bounds assertion since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
 }
 #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 
+#if !defined(MLKEM_USE_NATIVE_NTT)
+/*
+ * Computes a block of CT butterflies with a fixed twiddle factor,
+ * using Montgomery multiplication.
+ * Parameters:
+ * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
+ * - zeta: Twiddle factor to use for the butterfly. This must be in
+ *         Montgomery form and signed canonical.
+ * - start: Offset to the beginning of the butterfly block
+ * - len: Index difference between coefficients subject to a butterfly
+ * - bound: Ghost variable describing coefficient bound: Prior to `start`,
+ *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
+ *          they must be bound by `bound`.
+ * When this function returns, output coefficients in the index range
+ * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
+ * Example:
+ * - start=8, len=4
+ *   This would compute the following four butterflies
+ *          8     --    12
+ *             9    --     13
+ *                10   --     14
+ *                   11   --     15
+ * - start=4, len=2
+ *   This would compute the following two butterflies
+ *          4 -- 6
+ *             5 -- 7
+ */
+static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
+                                unsigned start, unsigned len, int bound)
+__contract__(
+  requires(start < MLKEM_N)
+  requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
+  requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
+  requires(-HALF_Q < zeta && zeta < HALF_Q)
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(array_abs_bound(r, 0, start, bound + MLKEM_Q))
+  requires(array_abs_bound(r, start, MLKEM_N, bound))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q))
+  ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
+{
+  /* `bound` is a ghost variable only needed in the CBMC specification */
+  unsigned j;
+  ((void)bound);
+  for (j = start; j < start + len; j++)
+  __loop__(
+    invariant(start <= j && j <= start + len)
+    /*
+     * Coefficients are updated in strided pairs, so the bounds for the
+     * intermediate states alternate twice between the old and new bound
+     */
+    invariant(array_abs_bound(r, 0,           j,           bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j,           start + len, bound))
+    invariant(array_abs_bound(r, start + len, j + len,     bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j + len,     MLKEM_N,     bound)))
+  {
+    int16_t t;
+    t = fqmul(r[j + len], zeta);
+    r[j + len] = r[j] - t;
+    r[j] = r[j] + t;
+  }
+}
+
+/*
+ * Compute one layer of forward NTT
+ * Parameters:
+ * - r: Pointer to base of polynomial
+ * - len: Stride of butterflies in this layer.
+ * - layer: Ghost variable indicating which layer is being applied.
+ *          Must match `len` via `len == MLKEM_N >> layer`.
+ * Note: `len` could be dropped and computed in the function, but
+ *   we are following the structure of the reference NTT from the
+ *   official Kyber implementation here, merely adding `layer` as
+ *   a ghost variable for the specifications.
+ */
+static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
+  requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
+{
+  unsigned start, k;
+  /* `layer` is a ghost variable only needed in the CBMC specification */
+  ((void)layer);
+  /* Twiddle factors for layer n start at index 2^(layer-1) */
+  k = MLKEM_N / (2 * len);
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(start < MLKEM_N + 2 * len)
+    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
+    invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
+    invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
+  {
+    int16_t zeta = zetas[k++];
+    ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
+  }
+}
+
+/*
+ * Compute full forward NTT
+ * NOTE: This particular implementation satisfies a much tighter
+ * bound on the output coefficients (5*q) than the contractual one (8*q),
+ * but this is not needed in the calling code. Should we change the
+ * base multiplication strategy to require smaller NTT output bounds,
+ * the proof may need strengthening.
+ */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *p)
+{
+  unsigned len, layer;
+  int16_t *r;
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+  r = p->coeffs;
+
+  for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
+  __loop__(
+    invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer))
+    invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)))
+  {
+    ntt_layer(r, len, layer);
+  }
+
+  /* Check the stronger bound */
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
+}
+#else  /* MLKEM_USE_NATIVE_NTT */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *p)
+{
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+  ntt_native(p->coeffs);
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
+}
+#endif /* MLKEM_USE_NATIVE_NTT */
+
+#if !defined(MLKEM_USE_NATIVE_INTT)
+
+/* Compute one layer of inverse NTT */
+static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
+  requires(len == (1 << (8 - layer)))
+  requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+{
+  unsigned start, k;
+  /* `layer` is a ghost variable used only in the specification */
+  ((void)layer);
+  k = MLKEM_N / len - 1;
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+    invariant(start <= MLKEM_N && k <= 127)
+    /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
+    invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
+  {
+    unsigned j;
+    int16_t zeta = zetas[k--];
+    for (j = start; j < start + len; j++)
+    __loop__(
+      invariant(start <= j && j <= start + len)
+      invariant(start <= MLKEM_N && k <= 127)
+      invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+    {
+      int16_t t = r[j];
+      r[j] = barrett_reduce(t + r[j + len]);
+      r[j + len] = r[j + len] - t;
+      r[j + len] = fqmul(r[j + len], zeta);
+    }
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *p)
+{
+  /*
+   * Scale input polynomial to account for Montgomery factor
+   * and NTT twist. This also brings coefficients down to
+   * absolute value < MLKEM_Q.
+   */
+  unsigned j, len, layer;
+  const int16_t f = 1441;
+  int16_t *r = p->coeffs;
+
+  for (j = 0; j < MLKEM_N; j++)
+  __loop__(
+    invariant(j <= MLKEM_N)
+    invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
+  {
+    r[j] = fqmul(r[j], f);
+  }
+
+  /* Run the invNTT layers */
+  for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
+  __loop__(
+    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
+    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+  {
+    invntt_layer(p->coeffs, len, layer);
+  }
+
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
+}
+#else  /* MLKEM_USE_NATIVE_INTT */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *p)
+{
+  intt_native(p->coeffs);
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
+}
+#endif /* MLKEM_USE_NATIVE_INTT */
+
 #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 #define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.h
index 6a14c785d..cb0d67c1a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly.h
@@ -9,7 +9,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "reduce.h"
+#include "debug.h"
 #include "verify.h"
 
 /* Absolute exclusive upper bound for the output of the inverse NTT */
@@ -18,6 +18,9 @@
 /* Absolute exclusive upper bound for the output of the forward NTT */
 #define NTT_BOUND (8 * MLKEM_Q)
 
+#define zetas MLKEM_NAMESPACE(zetas)
+extern const int16_t zetas[128];
+
 /*
  * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
  * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
@@ -38,520 +41,6 @@ typedef struct
   int16_t coeffs[MLKEM_N >> 1];
 } poly_mulcache;
 
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1)
-#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4)
-#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5)
-#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10)
-#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11)
-#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4)
-#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5)
-#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10)
-#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11)
-#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q)
-/* End of static namespacing */
-
-/************************************************************
- * Name: scalar_compress_d1
- *
- * Description: Computes round(u * 2 / q)
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 1.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d1(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 2)
-  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
-{
-  uint32_t d0 = u << 1;
-  d0 *= 645083;
-  d0 += 1u << 30;
-  d0 >>= 31;
-  return d0;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_compress_d4
- *
- * Description: Computes round(u * 16 / q) % 16
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 4.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d4(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 16)
-  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
-{
-  uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */
-  return (d0 + (1u << 27)) >> 28;      /* round(d0/2^28) */
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d4
- *
- * Description: Computes round(u * q / 16)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 4.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d4(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 16)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 8) / 16; }
-
-/************************************************************
- * Name: scalar_compress_d5
- *
- * Description: Computes round(u * 32 / q) % 32
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 5.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d5(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 32)
-  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
-{
-  uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */
-  return (d0 + (1u << 26)) >> 27;      /* round(d0/2^27) */
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d5
- *
- * Description: Computes round(u * q / 32)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 5.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 32
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d5(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 32)
-  ensures(return_value <= MLKEM_Q - 1)
-) { return ((u * MLKEM_Q) + 16) / 32; }
-
-/************************************************************
- * Name: scalar_compress_d10
- *
- * Description: Computes round(u * 2**10 / q) % 2**10
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d10(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < (1u << 10))
-  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
-{
-  uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^32 / MLKEM_Q) */
-  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
-  return (d0 & 0x3FF);
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d10
- *
- * Description: Computes round(u * q / 1024)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d10(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 1024)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 512) / 1024; }
-
-/************************************************************
- * Name: scalar_compress_d11
- *
- * Description: Computes round(u * 2**11 / q) % 2**11
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 11.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d11(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < (1u << 11))
-  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
-{
-  uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */
-  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
-  return (d0 & 0x7FF);
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d11
- *
- * Description: Computes round(u * q / 1024)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d11(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 2048)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 1024) / 2048; }
-
-/************************************************************
- * Name: scalar_signed_to_unsigned_q
- *
- * Description: converts signed polynomial coefficient
- *              from signed (-3328 .. 3328) form to
- *              unsigned form (0 .. 3328).
- *
- * Note: Cryptographic constant time implementation
- *
- * Examples:       0 -> 0
- *                 1 -> 1
- *              3328 -> 3328
- *                -1 -> 3328
- *                -2 -> 3327
- *             -3328 -> 1
- *
- * Arguments: c: signed coefficient to be converted
- ************************************************************/
-static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
-__contract__(
-  requires(c > -MLKEM_Q && c < MLKEM_Q)
-  ensures(return_value >= 0 && return_value < MLKEM_Q)
-  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
-{
-  debug_assert_abs_bound(&c, 1, MLKEM_Q);
-
-  /* Add Q if c is negative, but in constant time */
-  c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
-
-  /* and therefore cast to uint16_t is safe. */
-  debug_assert_bound(&c, 1, 0, MLKEM_Q);
-  return (uint16_t)c;
-}
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
-    (MLKEM_K == 2 || MLKEM_K == 3)
-#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
-/*************************************************
- * Name:        poly_compress_d4
- *
- * Description: Compression (4 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
-
-#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
-/*************************************************
- * Name:        poly_compress_d10
- *
- * Description: Compression (10 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
-
-#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
-/*************************************************
- * Name:        poly_decompress_d4
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
-
-#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
-/*************************************************
- * Name:        poly_decompress_d10
- *
- * Description: De-serialization and subsequent decompression (10 bits) of a
- *              polynomial; approximate inverse of poly_compress_d10
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d10(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
-          || MLKEM_K == 3) */
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
-#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
-/*************************************************
- * Name:        poly_compress_d5
- *
- * Description: Compression (5 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
-
-#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
-/*************************************************
- * Name:        poly_compress_d11
- *
- * Description: Compression (11 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
-
-#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
-/*************************************************
- * Name:        poly_decompress_d5
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
-
-#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
-/*************************************************
- * Name:        poly_decompress_d11
- *
- * Description: De-serialization and subsequent decompression (11 bits) of a
- *              polynomial; approximate inverse of poly_compress_d11
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d11(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
-        */
-
-#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
-/*************************************************
- * Name:        poly_tobytes
- *
- * Description: Serialization of a polynomial.
- *              Signed coefficients are converted to
- *              unsigned form before serialization.
- *
- * Arguments:   INPUT:
- *              - a: const pointer to input polynomial,
- *                with each coefficient in the range [0,1,..,Q-1]
- *              OUTPUT
- *              - r: pointer to output byte array
- *                   (of MLKEM_POLYBYTES bytes)
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYBYTES))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r))
-);
-
-
-#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes)
-/*************************************************
- * Name:        poly_frombytes
- *
- * Description: De-serialization of a polynomial.
- *
- * Arguments:   INPUT
- *              - a: pointer to input byte array
- *                   (of MLKEM_POLYBYTES bytes)
- *              OUTPUT
- *              - r: pointer to output polynomial, with
- *                   each coefficient unsigned and in the range
- *                   0 .. 4095
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))
-);
-
-
-#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg)
-/*************************************************
- * Name:        poly_frommsg
- *
- * Description: Convert 32-byte message to polynomial
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *msg: pointer to input message
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
-__contract__(
-  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
-
-#define poly_tomsg MLKEM_NAMESPACE(poly_tomsg)
-/*************************************************
- * Name:        poly_tomsg
- *
- * Description: Convert polynomial to 32-byte message
- *
- * Arguments:   - uint8_t *msg: pointer to output message
- *              - const poly *r: pointer to input polynomial
- *                Coefficients must be unsigned canonical
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r)
-__contract__(
-  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(msg))
-);
-
 #define poly_basemul_montgomery_cached \
   MLKEM_NAMESPACE(poly_basemul_montgomery_cached)
 /*************************************************
@@ -715,4 +204,56 @@ __contract__(
   assigns(object_whole(r))
 );
 
+#define poly_ntt MLKEM_NAMESPACE(poly_ntt)
+/*************************************************
+ * Name:        poly_ntt
+ *
+ * Description: Computes negacyclic number-theoretic transform (NTT) of
+ *              a polynomial in place.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bounded by MLKEM_Q in absolute value.
+ *
+ *              The output polynomial is in bitreversed order, and
+ *              coefficient-wise bounded by NTT_BOUND in absolute value.
+ *
+ *              (NOTE: Sometimes the input to the NTT is actually smaller,
+ *               which gives better bounds.)
+ *
+ * Arguments:   - poly *r: pointer to in/output polynomial
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND))
+);
+
+#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont)
+/*************************************************
+ * Name:        poly_invntt_tomont
+ *
+ * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+ *              of a polynomial in place;
+ *              inputs assumed to be in bitreversed order, output in normal
+ *              order
+ *
+ *              The input is assumed to be in bitreversed order, and can
+ *              have arbitrary coefficients in int16_t.
+ *
+ *              The output polynomial is in normal order, and
+ *              coefficient-wise bounded by INVNTT_BOUND in absolute value.
+ *
+ * Arguments:   - poly *r: pointer to in/output polynomial
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND))
+);
+
 #endif /* POLY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly_k.c
similarity index 97%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly_k.c
index 50ea1c34a..c2d330ea9 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/polyvec.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly_k.c
@@ -2,13 +2,12 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#include "polyvec.h"
+#include "poly_k.h"
 #include <stdint.h>
 #include <string.h>
 #include "arith_backend.h"
-#include "cbd.h"
-#include "ntt.h"
-#include "poly.h"
+#include "compress.h"
+#include "sampling.h"
 #include "symmetric.h"
 
 #include "debug.h"
@@ -131,7 +130,9 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
   /* Omitting bounds assertion for cache since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
-  polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache);
+  polyvec_basemul_acc_montgomery_cached_native(r->coeffs, (const int16_t *)a,
+                                               (const int16_t *)b,
+                                               (const int16_t *)b_cache);
 }
 #endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly_k.h
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/polyvec.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly_k.h
index 8be8579e0..0aea95912 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/polyvec.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/poly_k.h
@@ -2,11 +2,12 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#ifndef POLYVEC_H
-#define POLYVEC_H
+#ifndef POLY_K_H
+#define POLY_K_H
 
 #include <stdint.h>
 #include "common.h"
+#include "compress.h"
 #include "poly.h"
 
 #define polyvec MLKEM_NAMESPACE_K(polyvec)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/reduce.h
deleted file mode 100644
index b432a4201..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/reduce.h
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef REDUCE_H
-#define REDUCE_H
-
-#include <stdint.h>
-#include "cbmc.h"
-#include "common.h"
-#include "debug.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16)
-#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic)
-#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce)
-#define fqmul MLKEM_NAMESPACE(fqmul)
-#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce)
-/* End of static namespacing */
-
-#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */
-
-/*************************************************
- * Name:        cast_uint16_to_int16
- *
- * Description: Cast uint16 value to int16
- *
- * Returns:
- *   input x in     0 .. 32767: returns value unchanged
- *   input x in 32768 .. 65535: returns (x - 65536)
- **************************************************/
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
-ALWAYS_INLINE
-static INLINE int16_t cast_uint16_to_int16(uint16_t x)
-{
-  /*
-   * PORTABILITY: This relies on uint16_t -> int16_t
-   * being implemented as the inverse of int16_t -> uint16_t,
-   * which is implementation-defined (C99 6.3.1.3 (3))
-   * CBMC (correctly) fails to prove this conversion is OK,
-   * so we have to suppress that check here
-   */
-  return (int16_t)x;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/*************************************************
- * Name:        montgomery_reduce_generic
- *
- * Description: Generic Montgomery reduction; given a 32-bit integer a, computes
- *              16-bit integer congruent to a * R^-1 mod q, where R=2^16
- *
- * Arguments:   - int32_t a: input integer to be reduced
- *
- * Returns:     integer congruent to a * R^-1 modulo q, with absolute value
- *                <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2
- *
- **************************************************/
-ALWAYS_INLINE
-static INLINE int16_t montgomery_reduce_generic(int32_t a)
-{
-  /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */
-  const uint32_t QINV = 62209; /* q^-1 mod 2^16 */
-
-  /*  Compute a*q^{-1} mod 2^16 in unsigned representatives */
-  const uint16_t a_reduced = a & UINT16_MAX;
-  const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
-
-  /* Lift to signed canonical representative mod 2^16. */
-  const int16_t t = cast_uint16_to_int16(a_inverted);
-
-  int32_t r = a - ((int32_t)t * MLKEM_Q);
-  /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */
-
-  /*
-   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
-   * implementation-defined for negative left argument. Here,
-   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
-   */
-  r = r >> 16;
-  /* Bounds: |r >> 16| <= ceil(|r| / 2^16)
-   *                   <= ceil(|a| / 2^16 + MLKEM_Q / 2)
-   *                   <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
-   *
-   * (Note that |a >> n| = ceil(|a| / 2^16) for negative a)
-   */
-
-  return (int16_t)r;
-}
-
-/*************************************************
- * Name:        montgomery_reduce
- *
- * Description: Montgomery reduction
- *
- * Arguments:   - int32_t a: input integer to be reduced
- *                  Must be smaller than 2 * 2^12 * 2^15 in absolute value.
- *
- * Returns:     integer congruent to a * R^-1 modulo q,
- *              smaller than 2 * q in absolute value.
- **************************************************/
-static INLINE int16_t montgomery_reduce(int32_t a)
-__contract__(
-  requires(a > -(2 * UINT12_LIMIT * 32768))
-  requires(a <  (2 * UINT12_LIMIT * 32768))
-  ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
-)
-{
-  int16_t res;
-  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
-
-  res = montgomery_reduce_generic(a);
-  /* Bounds:
-   * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2
-   *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
-   *        < 2 * MLKEM_Q */
-
-  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
-  return res;
-}
-
-/*************************************************
- * Name:        fqmul
- *
- * Description: Montgomery multiplication modulo q=3329
- *
- * Arguments:   - int16_t a: first factor
- *                  Can be any int16_t.
- *              - int16_t b: second factor.
- *                  Must be signed canonical (abs value <(q+1)/2)
- *
- * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and
- * smaller than q in absolute value.
- *
- **************************************************/
-static INLINE int16_t fqmul(int16_t a, int16_t b)
-__contract__(
-  requires(b > -HALF_Q)
-  requires(b < HALF_Q)
-  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
-)
-{
-  int16_t res;
-  debug_assert_abs_bound(&b, 1, HALF_Q);
-
-  res = montgomery_reduce((int32_t)a * (int32_t)b);
-  /* Bounds:
-   * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
-   *        < MLKEM_Q
-   */
-
-  debug_assert_abs_bound(&res, 1, MLKEM_Q);
-  return res;
-}
-
-/*************************************************
- * Name:        barrett_reduce
- *
- * Description: Barrett reduction; given a 16-bit integer a, computes
- *              centered representative congruent to a mod q in
- *              {-(q-1)/2,...,(q-1)/2}
- *
- * Arguments:   - int16_t a: input integer to be reduced
- *
- * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
- **************************************************/
-static INLINE int16_t barrett_reduce(int16_t a)
-__contract__(
-  ensures(return_value > -HALF_Q && return_value < HALF_Q)
-)
-{
-  /*
-   * To divide by MLKEM_Q using Barrett multiplication, the "magic number"
-   * multiplier is round_to_nearest(2**26/MLKEM_Q)
-   */
-  const int BPOWER = 26;
-  const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q;
-
-  /*
-   * Compute round_to_nearest(a/MLKEM_Q) using the multiplier
-   * above and shift by BPOWER places.
-   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
-   * implementation-defined for negative left argument. Here,
-   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
-   */
-  const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER;
-
-  /*
-   * t is in -10 .. +10, so we need 32-bit math to
-   * evaluate t * MLKEM_Q and the subsequent subtraction
-   */
-  int16_t res = (int16_t)(a - t * MLKEM_Q);
-
-  debug_assert_abs_bound(&res, 1, HALF_Q);
-  return res;
-}
-
-#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/sampling.c
similarity index 73%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/sampling.c
index cbbe4407f..98cbdcb74 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/rej_uniform.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/sampling.c
@@ -9,7 +9,7 @@
 #include "debug.h"
 #include "fips202.h"
 #include "fips202x4.h"
-#include "rej_uniform.h"
+#include "sampling.h"
 #include "symmetric.h"
 
 /* Static namespacing
@@ -18,6 +18,8 @@
  * within a single compilation unit. */
 #define rej_uniform MLKEM_NAMESPACE(rej_uniform)
 #define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
+#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
+#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
 /* End of static namespacing */
 
 static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
@@ -233,9 +235,113 @@ void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
   xof_release(&state);
 }
 
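+/* Static namespacing (repeats the identical #defines near the top of
+ * this file; harmless, since the replacement lists match).
+ * This facilitates building multiple instances of mlkem-native
+ * (e.g. with varying security levels) within a single compilation unit. */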
+#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
+#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
+/* End of static namespacing */
+
+/*************************************************
+ * Name:        load32_littleendian
+ *
+ * Description: Assemble 4 consecutive bytes into a 32-bit unsigned
+ *              integer, with x[0] as the least significant byte.
+ *
+ * Arguments:   - const uint8_t x[4]: input bytes; x[0]..x[3] are read
+ *
+ * Returns:     x[0] | x[1] << 8 | x[2] << 16 | x[3] << 24 as a uint32_t
+ **************************************************/
+static uint32_t load32_littleendian(const uint8_t x[4])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  r |= (uint32_t)x[3] << 24; /* cast first so the shift is done unsigned */
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API /* eta=2 centered binomial sampler, cf. sampling.h */
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
+{
+  unsigned i; /* index of the 4-byte input word being consumed */
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
+  {
+    unsigned j;
+    uint32_t t = load32_littleendian(buf + 4 * i);
+    uint32_t d = t & 0x55555555; /* even-position bits of t */
+    d += (t >> 1) & 0x55555555;  /* + odd-position bits: per-pair bit counts */
+    /* Every 2-bit field of d is now 0..2; each nibble yields one coefficient */
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
+    {
+      const int16_t a = (d >> (4 * j + 0)) & 0x3; /* low field of nibble j */
+      const int16_t b = (d >> (4 * j + 2)) & 0x3; /* high field of nibble j */
+      r->coeffs[8 * i + j] = a - b; /* difference lies in -2..2 */
+    }
+  }
+}
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+/*************************************************
+ * Name:        load24_littleendian
+ *
+ * Description: Assemble 3 consecutive bytes into a 32-bit unsigned
+ *              integer, with x[0] as the least significant byte.
+ *              Used by poly_cbd3, which is only needed for ML-KEM-512.
+ *
+ * Arguments:   - const uint8_t x[3]: input bytes; x[0]..x[2] are read
+ *
+ * Returns:     x[0] | x[1] << 8 | x[2] << 16 (bits 24..31 are zero)
+ **************************************************/
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API /* eta=3 centered binomial sampler, cf. sampling.h */
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
+{
+  unsigned i; /* index of the 3-byte input group being consumed */
+  for (i = 0; i < MLKEM_N / 4; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 4)
+    invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
+  {
+    unsigned j;
+    const uint32_t t = load24_littleendian(buf + 3 * i);
+    uint32_t d = t & 0x00249249; /* bit 0 of every 3-bit group */
+    d += (t >> 1) & 0x00249249;  /* + bit 1 of every 3-bit group */
+    d += (t >> 2) & 0x00249249;  /* + bit 2: per-group bit counts, 0..3 */
+    /* Each 6-bit chunk of d holds two such counts and yields one coefficient */
+    for (j = 0; j < 4; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 4 && j <= 4)
+      invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
+    {
+      const int16_t a = (d >> (6 * j + 0)) & 0x7; /* low count of chunk j */
+      const int16_t b = (d >> (6 * j + 3)) & 0x7; /* high count of chunk j */
+      r->coeffs[4 * i + j] = a - b; /* difference lies in -3..3 */
+    }
+  }
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
+          3 */
+
 #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
-#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform)
-int empty_cu_rej_uniform;
+#define empty_cu_sampling MLKEM_NAMESPACE_K(empty_cu_sampling)
+int empty_cu_sampling;
 
 #endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/sampling.h
similarity index 63%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/rej_uniform.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/sampling.h
index 801287259..cc524e0fc 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/rej_uniform.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/sampling.h
@@ -2,8 +2,8 @@
  * Copyright (c) 2024 The mlkem-native project authors
  * SPDX-License-Identifier: Apache-2.0
  */
-#ifndef REJ_UNIFORM_H
-#define REJ_UNIFORM_H
+#ifndef SAMPLING_H
+#define SAMPLING_H
 
 #include <stdint.h>
 #include <stdlib.h>
@@ -11,6 +11,37 @@
 #include "common.h"
 #include "poly.h"
 
+#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
+/*************************************************
+ * Name:        poly_cbd2
+ *
+ * Description: Given an array of uniformly random bytes, compute a
+ *              polynomial whose coefficients follow a centered
+ *              binomial distribution with parameter eta=2.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: input array of 2 * MLKEM_N / 4 bytes
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
+/*************************************************
+ * Name:        poly_cbd3
+ *
+ * Description: Given an array of uniformly random bytes, compute a
+ *              polynomial whose coefficients follow a centered
+ *              binomial distribution with parameter eta=3.
+ *              This function is only needed for ML-KEM-512.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: input array of 3 * MLKEM_N / 4 bytes
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED || MLKEM_ETA1 == 3 */
+
 #define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
 /*************************************************
  * Name:        poly_rej_uniform_x4
@@ -60,4 +91,4 @@ __contract__(
   assigns(memory_slice(entry, sizeof(poly)))
   ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
 
-#endif /* REJ_UNIFORM_H */
+#endif /* SAMPLING_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/zetas.c
index 4ef887c62..987f0dce4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/zetas.c
@@ -10,7 +10,7 @@
 
 #include "common.h"
 #if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-#include "ntt.h"
+#include "poly.h"
 
 /*
  * Table of zeta values used in the reference NTT and inverse NTT.
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/api.h
deleted file mode 100644
index 792ecb8a4..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/api.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * Native arithmetic interface
- *
- * This header is primarily for documentation purposes.
- * It should not be included by backend implementations.
- *
- * To ensure consistency with backends, the header will be
- * included automatically after inclusion of the active
- * backend, to ensure consistency of function signatures,
- * and run sanity checks.
- */
-#ifdef MLKEM_NATIVE_ARITH_NATIVE_API_H
-#error \
-    "The arithmetic backend API `mlkem/native/api.h` "		\
-    "should not be directly included. Please include the relevant "	\
-    "structure headers directly."
-#else /* MLKEM_NATIVE_ARITH_NATIVE_API_H */
-#define MLKEM_NATIVE_ARITH_NATIVE_API_H
-
-#include <stdint.h>
-#include "poly.h"
-#include "polyvec.h"
-
-/*
- * This is the C<->native interface allowing for the drop-in of
- * native code for performance critical arithmetic components of ML-KEM.
- *
- * A _backend_ is a specific implementation of (part of) this interface.
- *
- * To add a function to a backend, define MLKEM_USE_NATIVE_XXX and
- * implement `static inline xxx(...)` in the profile header.
- *
- * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can
- * be set if there are native implementations for all of NTT, invNTT, and
- * base multiplication, and allows the native implementation to use a
- * custom order of polynomial coefficients in NTT domain -- the use of such
- * custom order is not an implementation-detail since the public matrix
- * is generated in NTT domain. In this case, a permutation function
- * poly_permute_bitrev_to_custom() needs to be provided that permutes
- * polynomials in NTT domain from bitreversed to the custom order.
- */
-
-/*
- * Those functions are meant to be trivial wrappers around the chosen native
- * implementation. The are static inline to avoid unnecessary calls.
- * The macro before each declaration controls whether a native
- * implementation is present.
- */
-
-#if defined(MLKEM_USE_NATIVE_NTT)
-/*************************************************
- * Name:        ntt_native
- *
- * Description: Computes negacyclic number-theoretic transform (NTT) of
- *              a polynomial in place.
- *
- *              The input polynomial is assumed to be in normal order.
- *              The output polynomial is in bitreversed order, or of a
- *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
- *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
- *              for more information.
- *
- * Arguments:   - poly *p: pointer to in/output polynomial
- **************************************************/
-static INLINE void ntt_native(poly *);
-#endif /* MLKEM_USE_NATIVE_NTT */
-
-#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
-/*
- * This must only be set if NTT, invNTT, basemul, mulcache, and
- * to/from byte stream conversions all have native implementations
- * that are adapted to the custom order.
- */
-#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \
-    !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) ||                  \
-    !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) ||  \
-    !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) ||                           \
-    !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
-#error \
-    "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \
-set if there are native implementations for NTT, invNTT, mulcache, basemul, \
-and to/from bytes conversions."
-#endif
-
-/*************************************************
- * Name:        poly_permute_bitrev_to_custom
- *
- * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined,
- *              convert a polynomial in NTT domain from bitreversed
- *              order to the custom order output by the native NTT.
- *
- *              This must only be defined if there is native code for
- *              all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache.
- * Arguments:   - poly *p: pointer to in/output polynomial
- *
- **************************************************/
-static INLINE void poly_permute_bitrev_to_custom(poly *);
-#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
-
-#if defined(MLKEM_USE_NATIVE_INTT)
-/*************************************************
- * Name:        intt_native
- *
- * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
- *              of a polynomial in place.
- *
- *              The input polynomial is in bitreversed order, or of a
- *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
- *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
- *              for more information.
- *              The output polynomial is assumed to be in normal order.
- *
- * Arguments:   - uint16_t *a: pointer to in/output polynomial
- **************************************************/
-static INLINE void intt_native(poly *);
-#endif /* MLKEM_USE_NATIVE_INTT */
-
-#if defined(MLKEM_USE_NATIVE_POLY_REDUCE)
-/*************************************************
- * Name:        poly_reduce_native
- *
- * Description: Applies modular reduction to all coefficients of a polynomial.
- *
- * Arguments:   - poly *r: pointer to input/output polynomial
- **************************************************/
-static INLINE void poly_reduce_native(poly *);
-#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
-
-#if defined(MLKEM_USE_NATIVE_POLY_TOMONT)
-/*************************************************
- * Name:        poly_tomont_native
- *
- * Description: Inplace conversion of all coefficients of a polynomial
- *              from normal domain to Montgomery domain
- *
- * Arguments:   - poly *r: pointer to input/output polynomial
- **************************************************/
-static INLINE void poly_tomont_native(poly *);
-#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
-
-#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE)
-/*************************************************
- * Name:        poly_mulcache_compute_native
- *
- * Description: Compute multiplication cache for a polynomial
- *              in NTT domain.
- *
- *              The purpose of the multiplication cache is to
- *              cache repeated computations required during a
- *              base multiplication of polynomials in NTT domain.
- *              The structure of the multiplication-cache is
- *              implementation defined.
- *
- * Arguments:   INPUT:
- *              - poly: const pointer to input polynomial.
- *                  This must be in NTT domain and inin bitreversed order, or of
- *                  a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
- *                  See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
- *                  for more information.
- *              OUTPUT
- *              - cache: pointer to multiplication cache
- **************************************************/
-static INLINE void poly_mulcache_compute_native(poly_mulcache *cache,
-                                                const poly *poly);
-#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
-
-#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
-/*************************************************
- * Name:        poly_mulcache_compute_native
- *
- * Description: Compute multiplication of polynomials in NTT domain.
- *
- * Arguments:   INPUT:
- *              - a: First polynomial operand.
- *                 This must be in NTT domain and inin bitreversed order, or of
- *                 a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
- *                 See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
- *                 for more information.
- *              - b: Second polynomial operand.
- *                 As for a.
- *              - b_cache: Multiplication-cache for b.
- *              OUTPUT
- *              - r: Result of the base multiplication. This is again
- *                   in NTT domain, and of the same order as a and b.
- **************************************************/
-static INLINE void polyvec_basemul_acc_montgomery_cached_native(
-    poly *r, const polyvec *a, const polyvec *b,
-    const polyvec_mulcache *b_cache);
-#endif
-
-#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
-/*************************************************
- * Name:        poly_tobytes_native
- *
- * Description: Serialization of a polynomial.
- *              Signed coefficients are converted to
- *              unsigned form before serialization.
- *
- * Arguments:   INPUT:
- *              - a: const pointer to input polynomial,
- *                with each coefficient in the range -Q+1 .. Q-1
- *              OUTPUT
- *              - r: pointer to output byte array
- *                   (of MLKEM_POLYBYTES bytes)
- **************************************************/
-static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
-                                       const poly *a);
-#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
-
-#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
-/*************************************************
- * Name:        poly_frombytes_native
- *
- * Description: Serialization of a polynomial.
- *              Signed coefficients are converted to
- *              unsigned form before serialization.
- *
- * Arguments:   INPUT:
- *              - r: pointer to output polynomial in NTT domain
- *              OUTPUT
- *              - a: const pointer to input byte aray
- *                   (of MLKEM_POLYBYTES bytes)
- **************************************************/
-static INLINE void poly_frombytes_native(poly *a,
-                                         const uint8_t r[MLKEM_POLYBYTES]);
-#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-
-#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
-/*************************************************
- * Name:        rej_uniform_native
- *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
- *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int len:    requested number of 16-bit integers
- *                                     (uniform mod q).
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes.
- *
- * Return -1 if the native implementation does not support the input lengths.
- * Otherwise, returns non-negative number of sampled 16-bit integers (at most
- * len).
- **************************************************/
-static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
-                                     const uint8_t *buf, unsigned int buflen);
-#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-
-#endif /* MLKEM_NATIVE_ARITH_NATIVE_API_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/arith_backend.h
index 0543b1bd1..ade31cda1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/arith_backend.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/arith_backend.h
@@ -17,7 +17,7 @@
  * Keep this _after_ the inclusion of the backend; otherwise,
  * the sanity checks won't have an effect. */
 #if defined(MLKEM_NATIVE_CHECK_APIS)
-#include "api.h"
+#include "native/api.h"
 #endif
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.c
deleted file mode 100644
index 1e6b7c5d1..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
-
-#include <stdint.h>
-#include "cbd.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
-#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
-/* End of static namespacing */
-
-/*************************************************
- * Name:        load32_littleendian
- *
- * Description: load 4 bytes into a 32-bit integer
- *              in little-endian order
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x
- **************************************************/
-static uint32_t load32_littleendian(const uint8_t x[4])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  r |= (uint32_t)x[3] << 24;
-  return r;
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
-  {
-    unsigned j;
-    uint32_t t = load32_littleendian(buf + 4 * i);
-    uint32_t d = t & 0x55555555;
-    d += (t >> 1) & 0x55555555;
-
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
-    {
-      const int16_t a = (d >> (4 * j + 0)) & 0x3;
-      const int16_t b = (d >> (4 * j + 2)) & 0x3;
-      r->coeffs[8 * i + j] = a - b;
-    }
-  }
-}
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
-/*************************************************
- * Name:        load24_littleendian
- *
- * Description: load 3 bytes into a 32-bit integer
- *              in little-endian order.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
- **************************************************/
-static uint32_t load24_littleendian(const uint8_t x[3])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  return r;
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 4; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 4)
-    invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
-  {
-    unsigned j;
-    const uint32_t t = load24_littleendian(buf + 3 * i);
-    uint32_t d = t & 0x00249249;
-    d += (t >> 1) & 0x00249249;
-    d += (t >> 2) & 0x00249249;
-
-    for (j = 0; j < 4; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 4 && j <= 4)
-      invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
-    {
-      const int16_t a = (d >> (6 * j + 0)) & 0x7;
-      const int16_t b = (d >> (6 * j + 3)) & 0x7;
-      r->coeffs[4 * i + j] = a - b;
-    }
-  }
-}
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
-          3 */
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd)
-int empty_cu_cbd;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.h
deleted file mode 100644
index 54c1f5b90..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/cbd.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef CBD_H
-#define CBD_H
-
-#include <stdint.h>
-#include "common.h"
-#include "poly.h"
-
-#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
-/*************************************************
- * Name:        poly_cbd2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
-#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
-/*************************************************
- * Name:        poly_cbd3
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=3.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */
-
-#endif /* CBD_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/common.h
index 4f326333e..62ed53ab1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/common.h
@@ -15,12 +15,19 @@
 #include "sys.h"
 
 /* Include backend metadata */
-#if defined(MLKEM_USE_NATIVE)
-#if defined(MLKEM_NATIVE_ARITH_BACKEND)
-#include MLKEM_NATIVE_ARITH_BACKEND
+#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH)
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_FILE)
+#include MLKEM_NATIVE_ARITH_BACKEND_FILE
+#else
+#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_ARITH is set, but MLKEM_NATIVE_ARITH_BACKEND_FILE is not.
+#endif
 #endif
-#if defined(MLKEM_NATIVE_FIPS202_BACKEND)
-#include MLKEM_NATIVE_FIPS202_BACKEND
+
+#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202)
+#if defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE)
+#include MLKEM_NATIVE_FIPS202_BACKEND_FILE
+#else
+#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, but MLKEM_NATIVE_FIPS202_BACKEND_FILE is not.
 #endif
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/compress.c
new file mode 100644
index 000000000..a03fe0ac4
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/compress.c
@@ -0,0 +1,395 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
+#include <stdint.h>
+#include <string.h>
+#include "arith_backend.h"
+#include "cbmc.h"
+#include "compress.h"
+#include "debug.h"
+#include "verify.h"
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
+MLKEM_NATIVE_INTERNAL_API /* pack 4-bit compressed coefficients into bytes */
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  /* Each outer iteration turns 8 coefficients into 4 output bytes. */
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 16)))
+    {
+      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
+    }
+    /* Two values per byte: even index in the low nibble, odd in the high. */
+    r[i * 4] = t[0] | (t[1] << 4);
+    r[i * 4 + 1] = t[2] | (t[3] << 4);
+    r[i * 4 + 2] = t[4] | (t[5] << 4);
+    r[i * 4 + 3] = t[6] | (t[7] << 4);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API /* pack 10-bit compressed coefficients into bytes */
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
+{
+  unsigned j; /* index of the current group of 4 coefficients */
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(invariant(j <= MLKEM_N / 4))
+  {
+    unsigned k;
+    uint16_t t[4];
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k <= 4)
+      invariant(forall(r, 0, k, t[r] < (1u << 10))))
+    {
+      t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
+    }
+    /*
+     * Pack four 10-bit values into 5 bytes, least significant bits first.
+     * Truncation is made explicit with & 0xFF; the unmasked right-shift
+     * terms fit in a byte because each t[k] is below 2^10.
+     */
+    r[5 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
+    r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
+    r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
+    r[5 * j + 4] = (t[3] >> 2);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API /* unpack 4-bit values and decompress each one */
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+{
+  unsigned i; /* input byte index; each byte carries two coefficients */
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
+  {
+    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF); /* low */
+    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF); /* high */
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API /* unpack 10-bit values and decompress each one */
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+{
+  unsigned j; /* index of the current group of 4 coefficients */
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 4)
+    invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[4];
+    uint8_t const *base = &a[5 * j]; /* first of the 5 bytes of group j */
+    /* Extract four 10-bit values; 0x3FF discards the neighbouring bits. */
+    t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6));
+    t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4));
+    t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2));
+    /* Decompress each 10-bit value into its coefficient slot. */
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k <= 4)
+      invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+MLKEM_NATIVE_INTERNAL_API /* pack 5-bit compressed coefficients into bytes */
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
+{
+  unsigned i; /* index of the current group of 8 coefficients */
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 32)))
+    {
+      t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
+    }
+    /*
+     * Pack eight 5-bit values into 5 bytes, least significant bits first.
+     * Truncation to 8 bits is written explicitly (0xFF &) to avoid CBMC
+     * warnings about implicit truncation; r is addressed by array index
+     * rather than pointer arithmetic to simplify verification.
+     */
+    r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
+    r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
+    r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
+    r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
+    r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API /* pack 11-bit compressed coefficients into bytes */
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
+{
+  unsigned j; /* index of the current group of 8 coefficients */
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(invariant(j <= MLKEM_N / 8))
+  {
+    unsigned k;
+    uint16_t t[8];
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+    {
+      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+    }
+    /*
+     * Pack eight 11-bit values into 11 bytes, least significant bits
+     * first. Truncation is made explicit with & 0xFF; the unmasked
+     * right-shift terms fit in a byte because each t[k] is below 2^11.
+     */
+    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
+    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
+    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
+    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
+    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
+    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
+    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
+    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
+    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
+    r[11 * j + 10] = (t[7] >> 3);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API /* unpack 5-bit values and decompress each one */
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+{
+  unsigned i; /* index of the current group of 8 coefficients */
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
+  {
+    unsigned j;
+    uint8_t t[8];
+    const unsigned offset = i * 5; /* first of the 5 input bytes of group i */
+    /*
+     * The 0x1F masks truncate explicitly, avoiding CBMC warnings
+     * about implicit truncation; the extraction is written out
+     * without a loop for ease of proof.
+     */
+
+    /*
+     * Unpack 5 bytes (40 bits) into eight 5-bit values
+     * stored in t[], least significant bits first.
+     */
+    t[0] = 0x1F & (a[offset + 0] >> 0);
+    t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3));
+    t[2] = 0x1F & (a[offset + 1] >> 2);
+    t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1));
+    t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4));
+    t[5] = 0x1F & (a[offset + 3] >> 1);
+    t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2));
+    t[7] = 0x1F & (a[offset + 4] >> 3);
+
+    /* then decompress each value into the matching slot of r->coeffs */
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(j <= 8 && i <= MLKEM_N / 8)
+      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+{ /* Unpack 11-bit values from a and decompress them into the coefficients of r. */
+  unsigned j; /* index of the current group of 8 coefficients */
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[8]; /* the 8 unpacked 11-bit values of group j */
+    uint8_t const *base = &a[11 * j]; /* first of the 11 input bytes of group j */
+    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
+    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
+                    ((uint16_t)base[4] << 10));
+    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
+    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
+    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
+                    ((uint16_t)base[8] << 9));
+    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
+    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
+
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{ /* C implementation: packs each pair of coefficients into 3 bytes of r. */
+  unsigned i; /* index of the current coefficient pair */
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(invariant(i <= MLKEM_N / 2))
+  {
+    const uint16_t t0 = a->coeffs[2 * i];
+    const uint16_t t1 = a->coeffs[2 * i + 1];
+    /*
+     * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
+     * significant data, so these can be packed into 24 bits or exactly
+     * 3 bytes, as follows.
+     */
+
+    /* Least significant bits 0 - 7 of t0. */
+    r[3 * i + 0] = t0 & 0xFF;
+
+    /*
+     * Most significant bits 8 - 11 of t0 become the least significant
+     * nibble of the second byte. The least significant 4 bits
+     * of t1 become the upper nibble of the second byte.
+     */
+    r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+
+    /* Bits 4 - 11 of t1 become the third byte. */
+    r[3 * i + 2] = t1 >> 4;
+  }
+}
+#else  /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{ /* Native variant: same input check, then forwards to poly_tobytes_native. */
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  poly_tobytes_native(r, a->coeffs); /* operates on the raw coefficient array */
+}
+#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{ /* C implementation: unpacks every 3 input bytes into two 12-bit coefficients. */
+  unsigned i; /* index of the current coefficient pair */
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
+  {
+    const uint8_t t0 = a[3 * i + 0];
+    const uint8_t t1 = a[3 * i + 1];
+    const uint8_t t2 = a[3 * i + 2];
+    r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF); /* t0, plus low nibble of t1 as bits 8-11 */
+    r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4); /* high nibble of t1, plus t2 as bits 4-11 */
+  }
+
+  /* Note: the coefficients need not be canonical; only the 12-bit bound is checked */
+  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
+}
+#else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{ /* Native variant: forwards to poly_frombytes_native; no bound check is done here. */
+  poly_frombytes_native(r->coeffs, a);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+{ /* Sets one coefficient of r per message bit, via a constant-time-style select. */
+  unsigned i; /* index of the current message byte */
+#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
+#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
+#endif
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
+  {
+    unsigned j; /* bit position within msg[i] */
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <  MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
+    {
+      /* Prevent the compiler from recognizing this as a bit selection */
+      uint8_t mask = value_barrier_u8(1u << j);
+      r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask); /* NOTE(review): presumably HALF_Q if bit j is set, else 0 -- confirm ct_sel_int16 argument order */
+    }
+  }
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q); /* NOTE(review): the loop invariants use array_bound(..., 0, MLKEM_Q); this abs-bound check looks weaker -- confirm intent */
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
+{ /* Compresses each coefficient of a to one bit and packs 8 bits per byte of msg. */
+  unsigned i; /* index of the current output byte */
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j; /* bit position within msg[i] */
+    msg[i] = 0; /* cleared first; bits are OR-ed in below */
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8))
+    {
+      uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]); /* 0 or 1 per its contract */
+      msg[i] |= t << j; /* becomes bit j of msg[i] */
+    }
+  }
+}
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_compress MLKEM_NAMESPACE_K(empty_cu_compress) /* namespaced like the other symbols */
+int empty_cu_compress; /* NOTE(review): presumably a dummy symbol so this translation unit is not empty -- confirm */
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/compress.h
new file mode 100644
index 000000000..409dbe519
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/compress.h
@@ -0,0 +1,495 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef COMPRESS_H
+#define COMPRESS_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "cbmc.h"
+#include "common.h"
+#include "debug.h"
+#include "poly.h"
+#include "verify.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1)
+#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4)
+#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5)
+#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10)
+#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11)
+#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4)
+#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5)
+#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10)
+#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11)
+/* End of static namespacing */
+
+/************************************************************
+ * Name: scalar_compress_d1
+ *
+ * Description: Computes round(u * 2 / q)
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 1.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d1(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 2)
+  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
+{
+  uint32_t d0 = u << 1; /* 2 * u */
+  d0 *= 645083; /* NOTE(review): presumably an approximation of 2^31 / MLKEM_Q -- confirm */
+  d0 += 1u << 30; /* rounding offset: half of 2^31 */
+  d0 >>= 31; /* only the top bit of the 32-bit value remains: 0 or 1 */
+  return d0;
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_compress_d4
+ *
+ * Description: Computes round(u * 16 / q) % 16
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d4(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 16)
+  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
+{
+  uint32_t d0 = (uint32_t)u * 1290160; /* 2^4 * round(2^28 / MLKEM_Q) */
+  return (d0 + (1u << 27)) >> 28;      /* round(d0/2^28); top 4 bits of a 32-bit value */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d4
+ *
+ * Description: Computes round(u * q / 16)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 16
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d4(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 16)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 8) / 16; } /* adding 8 (half of 16) rounds the division */
+
+/************************************************************
+ * Name: scalar_compress_d5
+ *
+ * Description: Computes round(u * 32 / q) % 32
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d5(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 32)
+  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
+{
+  uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */
+  return (d0 + (1u << 26)) >> 27;      /* round(d0/2^27); top 5 bits of a 32-bit value */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d5
+ *
+ * Description: Computes round(u * q / 32)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 32
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d5(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 32)
+  ensures(return_value <= MLKEM_Q - 1)
+) { return ((u * MLKEM_Q) + 16) / 32; } /* adding 16 (half of 32) rounds the division */
+
+/************************************************************
+ * Name: scalar_compress_d10
+ *
+ * Description: Computes round(u * 2**10 / q) % 2**10
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine is carried out in 64 bits and does not
+ * wrap around; the result is reduced modulo 2^10 by the final mask.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d10(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 10))
+  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
+{
+  uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^33 / MLKEM_Q) */
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33; /* round(d0/2^33): add half, then shift */
+  return (d0 & 0x3FF); /* keep the low 10 bits */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d10
+ *
+ * Description: Computes round(u * q / 1024)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 1024
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d10(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 1024)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 512) / 1024; } /* adding 512 (half of 1024) rounds the division */
+
+/************************************************************
+ * Name: scalar_compress_d11
+ *
+ * Description: Computes round(u * 2**11 / q) % 2**11
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine is carried out in 64 bits and does not
+ * wrap around; the result is reduced modulo 2^11 by the final mask.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d11(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 11))
+  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
+{
+  uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33; /* round(d0/2^33): add half, then shift */
+  return (d0 & 0x7FF); /* keep the low 11 bits */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d11
+ *
+ * Description: Computes round(u * q / 2048)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 2048
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d11(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 2048)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 1024) / 2048; } /* adding 1024 (half of 2048) rounds the division */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
+    (MLKEM_K == 2 || MLKEM_K == 3)
+#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
+/*************************************************
+ * Name:        poly_compress_d4
+ *
+ * Description: Compression (4 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
+
+#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
+/*************************************************
+ * Name:        poly_compress_d10
+ *
+ * Description: Compression (10 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
+
+#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
+/*************************************************
+ * Name:        poly_decompress_d4
+ *
+ * Description: De-serialization and subsequent decompression (4 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d4
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
+
+#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
+/*************************************************
+ * Name:        poly_decompress_d10
+ *
+ * Description: De-serialization and subsequent decompression (10 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d10
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
+/*************************************************
+ * Name:        poly_compress_d5
+ *
+ * Description: Compression (5 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
+
+#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
+/*************************************************
+ * Name:        poly_compress_d11
+ *
+ * Description: Compression (11 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
+
+#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
+/*************************************************
+ * Name:        poly_decompress_d5
+ *
+ * Description: De-serialization and subsequent decompression (5 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d5
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+
+#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
+/*************************************************
+ * Name:        poly_decompress_d11
+ *
+ * Description: De-serialization and subsequent decompression (11 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d11
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
+        */
+
+#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
+/*************************************************
+ * Name:        poly_tobytes
+ *
+ * Description: Serialization of a polynomial.
+ *              The coefficients are expected to already be in
+ *              unsigned canonical form (see the contract below).
+ *
+ * Arguments:   INPUT:
+ *              - a: const pointer to input polynomial,
+ *                with each coefficient in the range [0,1,..,Q-1]
+ *              OUTPUT
+ *              - r: pointer to output byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYBYTES))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r))
+);
+
+
+#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes)
+/*************************************************
+ * Name:        poly_frombytes
+ *
+ * Description: De-serialization of a polynomial.
+ *
+ * Arguments:   INPUT
+ *              - a: pointer to input byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ *              OUTPUT
+ *              - r: pointer to output polynomial, with
+ *                   each coefficient unsigned and in the range
+ *                   0 .. 4095
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))
+);
+
+
+#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg)
+/*************************************************
+ * Name:        poly_frommsg
+ *
+ * Description: Convert 32-byte message to polynomial
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *msg: pointer to input message
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+);
+
+#define poly_tomsg MLKEM_NAMESPACE(poly_tomsg)
+/*************************************************
+ * Name:        poly_tomsg
+ *
+ * Description: Convert polynomial to 32-byte message
+ *
+ * Arguments:   - uint8_t *msg: pointer to output message
+ *              - const poly *r: pointer to input polynomial
+ *                Coefficients must be unsigned canonical
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r)
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(msg))
+);
+
+#endif /* COMPRESS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/config.h
index fa89370ce..e975ede95 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/config.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/config.h
@@ -122,46 +122,87 @@
 /* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 /******************************************************************************
- * Name:        MLKEM_USE_NATIVE
+ * Name:        MLKEM_USE_NATIVE_BACKEND_ARITH
  *
- * Description: Determines whether a native backend should
- *              be used, if available.
+ * Description: Determines whether a native arithmetic backend should be used.
+ *
+ *              The arithmetic backend covers performance critical functions
+ *              such as the number-theoretic transform (NTT).
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the arithmetic backend to be used is
+ *              determined by MLKEM_NATIVE_ARITH_BACKEND_FILE: If the latter is
+ *              unset, the default backend for the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
  *
  *              This can also be set using CFLAGS.
  *
  *****************************************************************************/
-#if !defined(MLKEM_USE_NATIVE)
-/* #define MLKEM_USE_NATIVE */
+#if !defined(MLKEM_USE_NATIVE_BACKEND_ARITH)
+/* #define MLKEM_USE_NATIVE_BACKEND_ARITH */
 #endif
 
 /******************************************************************************
- * Name:        MLKEM_NATIVE_ARITH_BACKEND
+ * Name:        MLKEM_NATIVE_ARITH_BACKEND_FILE
  *
  * Description: The arithmetic backend to use.
  *
- *              This must be the filename of an arithmetic backend.
- *              See the existing backends for examples.
+ *              If MLKEM_USE_NATIVE_BACKEND_ARITH is unset, this option
+ *              is ignored.
+ *
+ *              If MLKEM_USE_NATIVE_BACKEND_ARITH is set, this option must
+ *              either be undefined or the filename of an arithmetic backend.
+ *              If unset, the default backend will be used.
  *
  *              This can be set using CFLAGS.
  *
  *****************************************************************************/
-#if defined(MLKEM_USE_NATIVE) && !defined(MLKEM_NATIVE_ARITH_BACKEND)
-#define MLKEM_NATIVE_ARITH_BACKEND "default.h"
-#endif /* MLKEM_NATIVE_ARITH_BACKEND */
+#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) && \
+    !defined(MLKEM_NATIVE_ARITH_BACKEND_FILE)
+#define MLKEM_NATIVE_ARITH_BACKEND_FILE "native/default.h"
+#endif
 
 /******************************************************************************
- * Name:        MLKEM_NATIVE_FIPS202_BACKEND
+ * Name:        MLKEM_USE_NATIVE_BACKEND_FIPS202
+ *
+ * Description: Determines whether a native FIPS202 backend should be used.
+ *
+ *              The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is
+ *              the performance bottleneck of SHA3 and SHAKE.
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the FIPS202 backend to be used is
+ *              determined by MLKEM_NATIVE_FIPS202_BACKEND_FILE: If the latter is
+ *              unset, the default backend for the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if !defined(MLKEM_USE_NATIVE_BACKEND_FIPS202)
+/* #define MLKEM_USE_NATIVE_BACKEND_FIPS202 */
+#endif
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_FIPS202_BACKEND_FILE
  *
  * Description: The FIPS-202 backend to use.
  *
- *              This must be the filename of an FIPS-202 backend.
+ *              If MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, this option must
+ *              either be undefined or the filename of a FIPS202 backend.
+ *              If unset, the default backend will be used.
  *
  *              This can be set using CFLAGS.
  *
  *****************************************************************************/
-#if defined(MLKEM_USE_NATIVE_FIPS202) && !defined(MLKEM_NATIVE_FIPS202_BACKEND)
-#define MLKEM_NATIVE_FIPS202_BACKEND "native/default.h"
-#endif /* MLKEM_NATIVE_FIPS202_BACKEND */
+#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) && \
+    !defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE)
+#define MLKEM_NATIVE_FIPS202_BACKEND_FILE "fips202/native/default.h"
+#endif
 
 /*************************  Config internals  ********************************/
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/default.h
deleted file mode 100644
index d1e41c52e..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/default.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H
-#define MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H
-
-/*
- * Default arithmetic backend
- */
-#include "sys.h"
-
-#ifdef SYS_AARCH64
-/*
- * For AArch64, we currently we have one clean and one opt profile.
- * We default to the opt profile.
- *
- * In the future, this may branch further depending on the microarchitecture.
- */
-#include "aarch64/opt.h"
-#endif /* SYS_AARCH64 */
-
-#ifdef SYS_X86_64_AVX2
-/*
- * For now, there's only one x86_64 profile, based on
- * the AVX2 code from the Kyber repository.
- * https://github.com/pq-crystals/kyber
- */
-#include "x86_64/default.h"
-#endif /* SYS_X86_64 */
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.c
index 0cfcc3e9e..318d0fc77 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.c
@@ -9,11 +9,10 @@
 #include "fips202.h"
 #include "fips202x4.h"
 #include "indcpa.h"
-#include "ntt.h"
 #include "poly.h"
-#include "polyvec.h"
+#include "poly_k.h"
 #include "randombytes.h"
-#include "rej_uniform.h"
+#include "sampling.h"
 #include "symmetric.h"
 
 #include "arith_backend.h"
@@ -149,14 +148,14 @@ static void unpack_ciphertext(polyvec *b, poly *v,
 #define poly_permute_bitrev_to_custom \
   MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom)
 
-static INLINE void poly_permute_bitrev_to_custom(poly *data)
+static INLINE void poly_permute_bitrev_to_custom(int16_t data[MLKEM_N])
 __contract__(
   /* We don't specify that this should be a permutation, but only
    * that it does not change the bound established at the end of gen_matrix. */
-  requires(memory_no_alias(data, sizeof(poly)))
-  requires(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N))
+  requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))
   assigns(memory_slice(data, sizeof(poly)))
-  ensures(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
+  ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
 #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
 
 /* Not static for benchmarking */
@@ -245,7 +244,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
   {
     for (j = 0; j < MLKEM_K; j++)
     {
-      poly_permute_bitrev_to_custom(&a[i].vec[j]);
+      poly_permute_bitrev_to_custom(a[i].vec[j].coeffs);
     }
   }
 }
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.h
index 2c4fda3c4..b4d5985bf 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/indcpa.h
@@ -8,7 +8,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "polyvec.h"
+#include "poly_k.h"
 
 #define gen_matrix MLKEM_NAMESPACE_K(gen_matrix)
 /*************************************************
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/native/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/native/api.h
new file mode 100644
index 000000000..0704f9dcd
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/native/api.h
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * Native arithmetic interface
+ *
+ * This header is primarily for documentation purposes.
+ * It should not be included by backend implementations.
+ *
+ * To ensure consistency with backends, the header will be
+ * included automatically after inclusion of the active
+ * backend, to ensure consistency of function signatures,
+ * and run sanity checks.
+ */
+#ifdef MLKEM_NATIVE_ARITH_NATIVE_API_H
+#error \
+    "The arithmetic backend API `mlkem/native/api.h` "		\
+    "should not be directly included. Please include the relevant "	\
+    "structure headers directly."
+#else /* MLKEM_NATIVE_ARITH_NATIVE_API_H */
+#define MLKEM_NATIVE_ARITH_NATIVE_API_H
+
+#include <stdint.h>
+#include "../common.h"
+
+/*
+ * This is the C<->native interface allowing for the drop-in of
+ * native code for performance critical arithmetic components of ML-KEM.
+ *
+ * A _backend_ is a specific implementation of (part of) this interface.
+ *
+ * To add a function to a backend, define MLKEM_USE_NATIVE_XXX and
+ * implement `static inline xxx(...)` in the profile header.
+ *
+ * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can
+ * be set if there are native implementations for all of NTT, invNTT, and
+ * base multiplication, and allows the native implementation to use a
+ * custom order of polynomial coefficients in NTT domain -- the use of such
+ * custom order is not an implementation-detail since the public matrix
+ * is generated in NTT domain. In this case, a permutation function
+ * poly_permute_bitrev_to_custom() needs to be provided that permutes
+ * polynomials in NTT domain from bitreversed to the custom order.
+ */
+
+/*
+ * Those functions are meant to be trivial wrappers around the chosen native
+ * implementation. They are static inline to avoid unnecessary calls.
+ * The macro before each declaration controls whether a native
+ * implementation is present.
+ */
+
+#if defined(MLKEM_USE_NATIVE_NTT)
+/*************************************************
+ * Name:        ntt_native
+ *
+ * Description: Computes negacyclic number-theoretic transform (NTT) of
+ *              a polynomial in place.
+ *
+ *              The input polynomial is assumed to be in normal order.
+ *              The output polynomial is in bitreversed order, or of a
+ *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *              for more information.
+ *
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
+ **************************************************/
+static INLINE void ntt_native(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_NTT */
+
+#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
+/*
+ * This must only be set if NTT, invNTT, basemul, mulcache, and
+ * to/from byte stream conversions all have native implementations
+ * that are adapted to the custom order.
+ */
+#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \
+    !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) ||                  \
+    !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) ||  \
+    !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) ||                           \
+    !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+#error \
+    "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \
+set if there are native implementations for NTT, invNTT, mulcache, basemul, \
+and to/from bytes conversions."
+#endif
+
+/*************************************************
+ * Name:        poly_permute_bitrev_to_custom
+ *
+ * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined,
+ *              convert a polynomial in NTT domain from bitreversed
+ *              order to the custom order output by the native NTT.
+ *
+ *              This must only be defined if there is native code for all of
+ *              NTT, invNTT, basemul, mulcache, and to/from bytes conversions.
+ *
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
+ **************************************************/
+static INLINE void poly_permute_bitrev_to_custom(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
+
+#if defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        intt_native
+ *
+ * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+ *              of a polynomial in place.
+ *
+ *              The input polynomial is in bitreversed order, or of a
+ *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *              for more information.
+ *              The output polynomial is assumed to be in normal order.
+ *
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
+ **************************************************/
+static INLINE void intt_native(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_INTT */
+
+#if defined(MLKEM_USE_NATIVE_POLY_REDUCE)
+/*************************************************
+ * Name:        poly_reduce_native
+ *
+ * Description: Applies modular reduction to all coefficients of a polynomial.
+ *
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to input/output polynomial
+ **************************************************/
+static INLINE void poly_reduce_native(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
+
+#if defined(MLKEM_USE_NATIVE_POLY_TOMONT)
+/*************************************************
+ * Name:        poly_tomont_native
+ *
+ * Description: Inplace conversion of all coefficients of a polynomial
+ *              from normal domain to Montgomery domain
+ *
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to input/output polynomial
+ **************************************************/
+static INLINE void poly_tomont_native(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
+
+#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE)
+/*************************************************
+ * Name:        poly_mulcache_compute_native
+ *
+ * Description: Compute multiplication cache for a polynomial
+ *              in NTT domain.
+ *
+ *              The purpose of the multiplication cache is to
+ *              cache repeated computations required during a
+ *              base multiplication of polynomials in NTT domain.
+ *              The structure of the multiplication-cache is
+ *              implementation defined.
+ *
+ * Arguments:   INPUT:
+ *              - poly: const pointer to input polynomial.
+ *                  This must be in NTT domain and in bitreversed order, or of
+ *                  a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *                  See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *                  for more information.
+ *              OUTPUT
+ *              - cache: pointer to multiplication cache
+ **************************************************/
+static INLINE void poly_mulcache_compute_native(int16_t cache[MLKEM_N / 2],
+                                                const int16_t poly[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
+#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery_cached_native
+ *
+ * Description: Compute multiplication of polynomials in NTT domain.
+ *
+ * Arguments:   INPUT:
+ *              - a: First polynomial operand.
+ *                 This must be in NTT domain and in bitreversed order, or of
+ *                 a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *                 See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *                 for more information.
+ *              - b: Second polynomial operand.
+ *                 As for a.
+ *              - b_cache: Multiplication-cache for b.
+ *              OUTPUT
+ *              - r: Result of the base multiplication. This is again
+ *                   in NTT domain, and of the same order as a and b.
+ **************************************************/
+static INLINE void polyvec_basemul_acc_montgomery_cached_native(
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]);
+#endif
+
+#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
+/*************************************************
+ * Name:        poly_tobytes_native
+ *
+ * Description: Serialization of a polynomial.
+ *              Signed coefficients are converted to
+ *              unsigned form before serialization.
+ *
+ * Arguments:   INPUT:
+ *              - a: const pointer to input polynomial,
+ *                with each coefficient in the range -Q+1 .. Q-1
+ *              OUTPUT
+ *              - r: pointer to output byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ **************************************************/
+static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
+                                       const int16_t a[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+
+#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+/*************************************************
+ * Name:        poly_frombytes_native
+ *
+ * Description: Deserialization of a polynomial.
+ *              Converts a byte array produced by serialization
+ *              back into a polynomial.
+ *
+ * Arguments:   INPUT:
+ *              - r: const pointer to input byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ *              OUTPUT
+ *              - a: pointer to output polynomial in NTT domain
+ **************************************************/
+static INLINE void poly_frombytes_native(int16_t a[MLKEM_N],
+                                         const uint8_t r[MLKEM_POLYBYTES]);
+#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+
+#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
+/*************************************************
+ * Name:        rej_uniform_native
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int len:    requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes.
+ *
+ * Return -1 if the native implementation does not support the input lengths.
+ * Otherwise, returns non-negative number of sampled 16-bit integers (at most
+ * len).
+ **************************************************/
+static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
+                                     const uint8_t *buf, unsigned int buflen);
+#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+#endif /* MLKEM_NATIVE_ARITH_NATIVE_API_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/native/default.h
new file mode 100644
index 000000000..f9fe4310a
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/native/default.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H
+#define MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H
+
+/*
+ * Default arithmetic backend
+ */
+#include "../sys.h"
+
+#ifdef SYS_AARCH64
+/*
+ * For AArch64, we currently have one clean and one opt profile.
+ * We default to the opt profile.
+ *
+ * In the future, this may branch further depending on the microarchitecture.
+ */
+#include "aarch64/opt.h"
+#endif /* SYS_AARCH64 */
+
+#ifdef SYS_X86_64_AVX2
+/*
+ * For now, there's only one x86_64 profile, based on
+ * the AVX2 code from the Kyber repository.
+ * https://github.com/pq-crystals/kyber
+ */
+#include "x86_64/default.h"
+#endif /* SYS_X86_64_AVX2 */
+
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.c
deleted file mode 100644
index 3651c8da9..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.c
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-
-#include <stdint.h>
-#include "arith_backend.h"
-#include "debug.h"
-#include "ntt.h"
-#include "reduce.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block)
-#define ntt_layer MLKEM_NAMESPACE(ntt_layer)
-#define invntt_layer MLKEM_NAMESPACE(invntt_layer)
-/* End of static namespacing */
-
-#if !defined(MLKEM_USE_NATIVE_NTT)
-/*
- * Computes a block CT butterflies with a fixed twiddle factor,
- * using Montgomery multiplication.
- * Parameters:
- * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
- * - root: Twiddle factor to use for the butterfly. This must be in
- *         Montgomery form and signed canonical.
- * - start: Offset to the beginning of the butterfly block
- * - len: Index difference between coefficients subject to a butterfly
- * - bound: Ghost variable describing coefficient bound: Prior to `start`,
- *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
- *          they must be bound by `bound`.
- * When this function returns, output coefficients in the index range
- * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
- * Example:
- * - start=8, len=4
- *   This would compute the following four butterflies
- *          8     --    12
- *             9    --     13
- *                10   --     14
- *                   11   --     15
- * - start=4, len=2
- *   This would compute the following two butterflies
- *          4 -- 6
- *             5 -- 7
- */
-static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
-                                unsigned start, unsigned len, int bound)
-__contract__(
-  requires(start < MLKEM_N)
-  requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
-  requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
-  requires(-HALF_Q < zeta && zeta < HALF_Q)
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(array_abs_bound(r, 0, start, bound + MLKEM_Q))
-  requires(array_abs_bound(r, start, MLKEM_N, bound))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q))
-  ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
-{
-  /* `bound` is a ghost variable only needed in the CBMC specification */
-  unsigned j;
-  ((void)bound);
-  for (j = start; j < start + len; j++)
-  __loop__(
-    invariant(start <= j && j <= start + len)
-    /*
-     * Coefficients are updated in strided pairs, so the bounds for the
-     * intermediate states alternate twice between the old and new bound
-     */
-    invariant(array_abs_bound(r, 0,           j,           bound + MLKEM_Q))
-    invariant(array_abs_bound(r, j,           start + len, bound))
-    invariant(array_abs_bound(r, start + len, j + len,     bound + MLKEM_Q))
-    invariant(array_abs_bound(r, j + len,     MLKEM_N,     bound)))
-  {
-    int16_t t;
-    t = fqmul(r[j + len], zeta);
-    r[j + len] = r[j] - t;
-    r[j] = r[j] + t;
-  }
-}
-
-/*
- *Compute one layer of forward NTT
- * Parameters:
- * - r: Pointer to base of polynomial
- * - len: Stride of butterflies in this layer.
- * - layer: Ghost variable indicating which layer is being applied.
- *          Must match `len` via `len == MLKEM_N >> layer`.
- * Note: `len` could be dropped and computed in the function, but
- *   we are following the structure of the reference NTT from the
- *   official Kyber implementation here, merely adding `layer` as
- *   a ghost variable for the specifications.
- */
-static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
-__contract__(
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
-  requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
-{
-  unsigned start, k;
-  /* `layer` is a ghost variable only needed in the CBMC specification */
-  ((void)layer);
-  /* Twiddle factors for layer n start at index 2^(layer-1) */
-  k = MLKEM_N / (2 * len);
-  for (start = 0; start < MLKEM_N; start += 2 * len)
-  __loop__(
-    invariant(start < MLKEM_N + 2 * len)
-    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
-    invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
-    invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
-  {
-    int16_t zeta = zetas[k++];
-    ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
-  }
-}
-
-/*
- * Compute full forward NTT
- * NOTE: This particular implementation satisfies a much tighter
- * bound on the output coefficients (5*q) than the contractual one (8*q),
- * but this is not needed in the calling code. Should we change the
- * base multiplication strategy to require smaller NTT output bounds,
- * the proof may need strengthening.
- */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *p)
-{
-  unsigned len, layer;
-  int16_t *r;
-  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
-  r = p->coeffs;
-
-  for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
-  __loop__(
-    invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer))
-    invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)))
-  {
-    ntt_layer(r, len, layer);
-  }
-
-  /* Check the stronger bound */
-  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
-}
-#else  /* MLKEM_USE_NATIVE_NTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *p)
-{
-  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
-  ntt_native(p);
-  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
-}
-#endif /* MLKEM_USE_NATIVE_NTT */
-
-#if !defined(MLKEM_USE_NATIVE_INTT)
-
-/* Compute one layer of inverse NTT */
-static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
-__contract__(
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
-  requires(len == (1 << (8 - layer)))
-  requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-{
-  unsigned start, k;
-  /* `layer` is a ghost variable used only in the specification */
-  ((void)layer);
-  k = MLKEM_N / len - 1;
-  for (start = 0; start < MLKEM_N; start += 2 * len)
-  __loop__(
-    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-    invariant(start <= MLKEM_N && k <= 127)
-    /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
-    invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
-  {
-    unsigned j;
-    int16_t zeta = zetas[k--];
-    for (j = start; j < start + len; j++)
-    __loop__(
-      invariant(start <= j && j <= start + len)
-      invariant(start <= MLKEM_N && k <= 127)
-      invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-    {
-      int16_t t = r[j];
-      r[j] = barrett_reduce(t + r[j + len]);
-      r[j + len] = r[j + len] - t;
-      r[j + len] = fqmul(r[j + len], zeta);
-    }
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *p)
-{
-  /*
-   * Scale input polynomial to account for Montgomery factor
-   * and NTT twist. This also brings coefficients down to
-   * absolute value < MLKEM_Q.
-   */
-  unsigned j, len, layer;
-  const int16_t f = 1441;
-  int16_t *r = p->coeffs;
-
-  for (j = 0; j < MLKEM_N; j++)
-  __loop__(
-    invariant(j <= MLKEM_N)
-    invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
-  {
-    r[j] = fqmul(r[j], f);
-  }
-
-  /* Run the invNTT layers */
-  for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
-  __loop__(
-    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
-    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-  {
-    invntt_layer(p->coeffs, len, layer);
-  }
-
-  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
-}
-#else  /* MLKEM_USE_NATIVE_INTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *p)
-{
-  intt_native(p);
-  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
-}
-#endif /* MLKEM_USE_NATIVE_INTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
-                    int16_t b_cached)
-{
-  int32_t t0, t1;
-  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
-
-  t0 = (int32_t)a[1] * b_cached;
-  t0 += (int32_t)a[0] * b[0];
-  t1 = (int32_t)a[0] * b[1];
-  t1 += (int32_t)a[1] * b[0];
-
-  /* |ti| < 2 * q * 2^15 */
-  r[0] = montgomery_reduce(t0);
-  r[1] = montgomery_reduce(t1);
-
-  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
-}
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt)
-int empty_cu_ntt;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.h
deleted file mode 100644
index 4e80d3ab3..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/ntt.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef NTT_H
-#define NTT_H
-#include "common.h"
-
-#include <stdint.h>
-#include "cbmc.h"
-#include "poly.h"
-#include "reduce.h"
-
-#define zetas MLKEM_NAMESPACE(zetas)
-extern const int16_t zetas[128];
-
-#define poly_ntt MLKEM_NAMESPACE(poly_ntt)
-/*************************************************
- * Name:        poly_ntt
- *
- * Description: Computes negacyclic number-theoretic transform (NTT) of
- *              a polynomial in place.
- *
- *              The input is assumed to be in normal order and
- *              coefficient-wise bound by MLKEM_Q in absolute value.
- *
- *              The output polynomial is in bitreversed order, and
- *              coefficient-wise bound by NTT_BOUND in absolute value.
- *
- *              (NOTE: Sometimes the input to the NTT is actually smaller,
- *               which gives better bounds.)
- *
- * Arguments:   - poly *p: pointer to in/output polynomial
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND))
-);
-
-#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont)
-/*************************************************
- * Name:        poly_invntt_tomont
- *
- * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
- *              of a polynomial in place;
- *              inputs assumed to be in bitreversed order, output in normal
- *              order
- *
- *              The input is assumed to be in bitreversed order, and can
- *              have arbitrary coefficients in int16_t.
- *
- *              The output polynomial is in normal order, and
- *              coefficient-wise bound by INVNTT_BOUND in absolute value.
- *
- * Arguments:   - uint16_t *a: pointer to in/output polynomial
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND))
-);
-
-#define basemul_cached MLKEM_NAMESPACE(basemul_cached)
-/************************************************************
- * Name: basemul_cached
- *
- * Description: Computes a representative modulo q of
- *              (a0*b0 + a1*b_cached, a0*b1 + a1*b0)/65536
- *
- *              If b_cached is b1*zeta, this represents the
- *              product of (a0 + a1*X) and (b0 + b1*X) in
- *              Fq[X]/(X^2 - zeta).
- *
- * Arguments: - r: Pointer to output polynomial
- *                   Upon return, coefficients are bound by
- *                   2*MLKEM_Q in absolute value.
- *            - a: Pointer to first input polynomial
- *                   Every coefficient must be in [0..4095]
- *            - b: Pointer to second input polynomial
- *                   Can have arbitrary int16_t coefficients
- *            - b_cached: Some precomputed value, typically derived from
- *                   b1 and a twiddle factor. Can be an arbitary int16_t.
- ************************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
-                    int16_t b_cached)
-__contract__(
-  requires(memory_no_alias(r, 2 * sizeof(int16_t)))
-  requires(memory_no_alias(a, 2 * sizeof(int16_t)))
-  requires(memory_no_alias(b, 2 * sizeof(int16_t)))
-  requires(array_bound(a, 0, 2, 0, UINT12_LIMIT))
-  assigns(memory_slice(r, 2 * sizeof(int16_t)))
-  ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))
-);
-
-#endif /* NTT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/params.h
index 57ea4c8ba..7f6c12625 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/params.h
@@ -18,6 +18,7 @@
 #define MLKEM_N 256
 #define MLKEM_Q 3329
 #define UINT12_LIMIT 4096
+#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */
 
 #define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */
 #define MLKEM_SSBYTES 32  /* size in bytes of shared key */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.c
index 7483ebf6d..e8a2e2c6e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.c
@@ -8,388 +8,246 @@
 #include <stdint.h>
 #include <string.h>
 #include "arith_backend.h"
-#include "cbd.h"
 #include "cbmc.h"
 #include "debug.h"
 #include "fips202x4.h"
-#include "ntt.h"
 #include "poly.h"
-#include "reduce.h"
+#include "sampling.h"
 #include "symmetric.h"
 #include "verify.h"
 
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
-{
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 16)))
-    {
-      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
-    }
-
-    r[i * 4] = t[0] | (t[1] << 4);
-    r[i * 4 + 1] = t[2] | (t[3] << 4);
-    r[i * 4 + 2] = t[4] | (t[5] << 4);
-    r[i * 4 + 3] = t[6] | (t[7] << 4);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
-{
-  unsigned j;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-  for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(invariant(j <= MLKEM_N / 4))
-  {
-    unsigned k;
-    uint16_t t[4];
-    for (k = 0; k < 4; k++)
-    __loop__(
-      invariant(k <= 4)
-      invariant(forall(r, 0, k, t[r] < (1u << 10))))
-    {
-      t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
-    }
-
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 10-bit in size.
-     */
-    r[5 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
-    r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
-    r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
-    r[5 * j + 4] = (t[3] >> 2);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
-  {
-    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
-    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d10(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16)
+#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic)
+#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce)
+#define fqmul MLKEM_NAMESPACE(fqmul)
+#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce)
+#define basemul_cached MLKEM_NAMESPACE(basemul_cached)
+#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q)
+#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block)
+#define ntt_layer MLKEM_NAMESPACE(ntt_layer)
+#define invntt_layer MLKEM_NAMESPACE(invntt_layer)
+/* End of static namespacing */
+
+/*************************************************
+ * Name:        cast_uint16_to_int16
+ *
+ * Description: Cast uint16 value to int16
+ *
+ * Returns:
+ *   input x in     0 .. 32767: returns value unchanged
+ *   input x in 32768 .. 65535: returns (x - 65536)
+ **************************************************/
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "conversion"
+#endif
+ALWAYS_INLINE
+static INLINE int16_t cast_uint16_to_int16(uint16_t x)
 {
-  unsigned j;
-  for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(
-    invariant(j <= MLKEM_N / 4)
-    invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
-  {
-    unsigned k;
-    uint16_t t[4];
-    uint8_t const *base = &a[5 * j];
-
-    t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6));
-    t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4));
-    t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2));
-
-    for (k = 0; k < 4; k++)
-    __loop__(
-      invariant(k <= 4)
-      invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
-    }
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  /*
+   * PORTABILITY: This relies on uint16_t -> int16_t
+   * being implemented as the inverse of int16_t -> uint16_t,
+   * which is implementation-defined (C99 6.3.1.3 (3))
+   * CBMC (correctly) fails to prove this conversion is OK,
+   * so we have to suppress that check here
+   */
+  return (int16_t)x;
 }
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
-          || MLKEM_K == 3) */
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
 
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
+/*************************************************
+ * Name:        montgomery_reduce_generic
+ *
+ * Description: Generic Montgomery reduction; given a 32-bit integer a, computes
+ *              16-bit integer congruent to a * R^-1 mod q, where R=2^16
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q, with absolute value
+ *                <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2
+ *
+ **************************************************/
+ALWAYS_INLINE
+static INLINE int16_t montgomery_reduce_generic(int32_t a)
 {
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */
+  const uint32_t QINV = 62209; /* q^-1 mod 2^16 */
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 32)))
-    {
-      t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
-    }
+  /*  Compute a*q^{-1} mod 2^16 in unsigned representatives */
+  const uint16_t a_reduced = a & UINT16_MAX;
+  const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
 
-    /*
-     * Explicitly truncate to avoid warning about
-     * implicit truncation in CBMC, and use array indexing into
-     * r rather than pointer-arithmetic to simplify verification
-     */
-    r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
-    r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
-    r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
-    r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
-    r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
-  }
-}
+  /* Lift to signed canonical representative mod 2^16. */
+  const int16_t t = cast_uint16_to_int16(a_inverted);
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
-{
-  unsigned j;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  int32_t r = a - ((int32_t)t * MLKEM_Q);
+  /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */
 
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(invariant(j <= MLKEM_N / 8))
-  {
-    unsigned k;
-    uint16_t t[8];
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(k <= 8)
-      invariant(forall(r, 0, k, t[r] < (1u << 11))))
-    {
-      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
-    }
+  /*
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  r = r >> 16;
+  /* Bounds: |r >> 16| <= ceil(|r| / 2^16)
+   *                   <= ceil(|a| / 2^16 + MLKEM_Q / 2)
+   *                   <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
+   *
+   * (Note that |a >> 16| = ceil(|a| / 2^16) for negative a)
+   */
 
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 11-bit in size.
-     */
-    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
-    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
-    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
-    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
-    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
-    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
-    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
-    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
-    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
-    r[11 * j + 10] = (t[7] >> 3);
-  }
+  return (int16_t)r;
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+/*************************************************
+ * Name:        montgomery_reduce
+ *
+ * Description: Montgomery reduction
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *                  Must be smaller than 2 * 2^12 * 2^15 in absolute value.
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q,
+ *              smaller than 2 * q in absolute value.
+ **************************************************/
+static INLINE int16_t montgomery_reduce(int32_t a)
+__contract__(
+  requires(a > -(2 * UINT12_LIMIT * 32768))
+  requires(a <  (2 * UINT12_LIMIT * 32768))
+  ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
+)
 {
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
-  {
-    unsigned j;
-    uint8_t t[8];
-    const unsigned offset = i * 5;
-    /*
-     * Explicitly truncate to avoid warning about
-     * implicit truncation in CBMC and unwind loop for ease
-     * of proof.
-     */
-
-    /*
-     * Decompress 5 8-bit bytes (so 40 bits) into
-     * 8 5-bit values stored in t[]
-     */
-    t[0] = 0x1F & (a[offset + 0] >> 0);
-    t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3));
-    t[2] = 0x1F & (a[offset + 1] >> 2);
-    t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1));
-    t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4));
-    t[5] = 0x1F & (a[offset + 3] >> 1);
-    t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2));
-    t[7] = 0x1F & (a[offset + 4] >> 3);
-
-    /* and copy to the correct slice in r[] */
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(j <= 8 && i <= MLKEM_N / 8)
-      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
-    }
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  int16_t res;
+  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
+
+  res = montgomery_reduce_generic(a);
+  /* Bounds:
+   * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2
+   *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
+   *        < 2 * MLKEM_Q */
+
+  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
+  return res;
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d11(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+#if !defined(MLKEM_USE_NATIVE_POLY_TOMONT) ||           \
+    !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
+    !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        fqmul
+ *
+ * Description: Montgomery multiplication modulo q=3329
+ *
+ * Arguments:   - int16_t a: first factor
+ *                  Can be any int16_t.
+ *              - int16_t b: second factor.
+ *                  Must be signed canonical (abs value <(q+1)/2)
+ *
+ * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and
+ * smaller than q in absolute value.
+ *
+ **************************************************/
+static INLINE int16_t fqmul(int16_t a, int16_t b)
+__contract__(
+  requires(b > -HALF_Q)
+  requires(b < HALF_Q)
+  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
+)
 {
-  unsigned j;
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(
-    invariant(j <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
-  {
-    unsigned k;
-    uint16_t t[8];
-    uint8_t const *base = &a[11 * j];
-    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
-    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
-                    ((uint16_t)base[4] << 10));
-    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
-    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
-    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
-                    ((uint16_t)base[8] << 9));
-    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
-    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
-
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(k <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
-    }
-  }
+  int16_t res;
+  debug_assert_abs_bound(&b, 1, HALF_Q);
+
+  res = montgomery_reduce((int32_t)a * (int32_t)b);
+  /* Bounds:
+   * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
+   *        < MLKEM_Q
+   */
 
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  debug_assert_abs_bound(&res, 1, MLKEM_Q);
+  return res;
 }
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */
-
-#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+#endif /* !defined(MLKEM_USE_NATIVE_POLY_TOMONT) ||           \
+          !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
+          !defined(MLKEM_USE_NATIVE_NTT) ||                   \
+          !defined(MLKEM_USE_NATIVE_INTT) */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || !defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        barrett_reduce
+ *
+ * Description: Barrett reduction; given a 16-bit integer a, computes
+ *              centered representative congruent to a mod q in
+ *              {-(q-1)/2,...,(q-1)/2}
+ *
+ * Arguments:   - int16_t a: input integer to be reduced
+ *
+ * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
+ **************************************************/
+static INLINE int16_t barrett_reduce(int16_t a)
+__contract__(
+  ensures(return_value > -HALF_Q && return_value < HALF_Q)
+)
 {
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(invariant(i <= MLKEM_N / 2))
-  {
-    const uint16_t t0 = a->coeffs[2 * i];
-    const uint16_t t1 = a->coeffs[2 * i + 1];
-    /*
-     * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
-     * significant data, so these can be packed into 24 bits or exactly
-     * 3 bytes, as follows.
-     */
-
-    /* Least significant bits 0 - 7 of t0. */
-    r[3 * i + 0] = t0 & 0xFF;
-
-    /*
-     * Most significant bits 8 - 11 of t0 become the least significant
-     * nibble of the second byte. The least significant 4 bits
-     * of t1 become the upper nibble of the second byte.
-     */
-    r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+  /*
+   * To divide by MLKEM_Q using Barrett multiplication, the "magic number"
+   * multiplier is round_to_nearest(2**26/MLKEM_Q)
+   */
+  const int BPOWER = 26;
+  const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q;
 
-    /* Bits 4 - 11 of t1 become the third byte. */
-    r[3 * i + 2] = t1 >> 4;
-  }
-}
-#else  /* MLKEM_USE_NATIVE_POLY_TOBYTES */
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
-{
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-  poly_tobytes_native(r, a);
-}
-#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+  /*
+   * Compute round_to_nearest(a/MLKEM_Q) using the multiplier
+   * above and shift by BPOWER places.
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER;
 
-#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
-  {
-    const uint8_t t0 = a[3 * i + 0];
-    const uint8_t t1 = a[3 * i + 1];
-    const uint8_t t2 = a[3 * i + 2];
-    r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
-    r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
-  }
+  /*
+   * t is in -10 .. +10, so we need 32-bit math to
+   * evaluate t * MLKEM_Q and the subsequent subtraction
+   */
+  int16_t res = (int16_t)(a - t * MLKEM_Q);
 
-  /* Note that the coefficients are not canonical */
-  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
-}
-#else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-{
-  poly_frombytes_native(r, a);
+  debug_assert_abs_bound(&res, 1, HALF_Q);
+  return res;
 }
-#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+#endif /* !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || \
+          !defined(MLKEM_USE_NATIVE_INTT) */
+
+static void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
+                           int16_t b_cached)
+__contract__(
+  requires(memory_no_alias(r, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(a, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(b, 2 * sizeof(int16_t)))
+  requires(array_bound(a, 0, 2, 0, UINT12_LIMIT))
+  assigns(memory_slice(r, 2 * sizeof(int16_t)))
+  ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q)))
 {
-  unsigned i;
-#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
-#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
-#endif
+  int32_t t0, t1;
+  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
-  {
-    unsigned j;
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <  MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
-    {
-      /* Prevent the compiler from recognizing this as a bit selection */
-      uint8_t mask = value_barrier_u8(1u << j);
-      r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
-    }
-  }
-  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
-}
+  t0 = (int32_t)a[1] * b_cached;
+  t0 += (int32_t)a[0] * b[0];
+  t1 = (int32_t)a[0] * b[1];
+  t1 += (int32_t)a[1] * b[0];
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
-{
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  /* |ti| < 2 * UINT12_LIMIT * 2^15, as montgomery_reduce() requires */
+  r[0] = montgomery_reduce(t0);
+  r[1] = montgomery_reduce(t1);
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    msg[i] = 0;
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8))
-    {
-      uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
-      msg[i] |= t << j;
-    }
-  }
+  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -434,12 +292,46 @@ void poly_tomont(poly *r)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomont(poly *r)
 {
-  poly_tomont_native(r);
+  poly_tomont_native(r->coeffs);
   debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
 
 #if !defined(MLKEM_USE_NATIVE_POLY_REDUCE)
+/************************************************************
+ * Name: scalar_signed_to_unsigned_q
+ *
+ * Description: converts signed polynomial coefficient
+ *              from signed (-3328 .. 3328) form to
+ *              unsigned form (0 .. 3328).
+ *
+ * Note: Cryptographic constant time implementation
+ *
+ * Examples:       0 -> 0
+ *                 1 -> 1
+ *              3328 -> 3328
+ *                -1 -> 3328
+ *                -2 -> 3327
+ *             -3328 -> 1
+ *
+ * Arguments: c: signed coefficient to be converted
+ ************************************************************/
+static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
+__contract__(
+  requires(c > -MLKEM_Q && c < MLKEM_Q)
+  ensures(return_value >= 0 && return_value < MLKEM_Q)
+  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
+{
+  debug_assert_abs_bound(&c, 1, MLKEM_Q);
+
+  /* Add Q if c is negative, but in constant time */
+  c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
+
+  /* and therefore cast to uint16_t is safe. */
+  debug_assert_bound(&c, 1, 0, MLKEM_Q);
+  return (uint16_t)c;
+}
+
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
@@ -461,7 +353,7 @@ void poly_reduce(poly *r)
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
-  poly_reduce_native(r);
+  poly_reduce_native(r->coeffs);
   debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
@@ -520,13 +412,232 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 MLKEM_NATIVE_INTERNAL_API
 void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
-  poly_mulcache_compute_native(x, a);
+  poly_mulcache_compute_native(x->coeffs, a->coeffs);
   /* Omitting bounds assertion since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
 }
 #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 
+#if !defined(MLKEM_USE_NATIVE_NTT)
+/*
+ * Computes a block of CT butterflies with a fixed twiddle factor,
+ * using Montgomery multiplication.
+ * Parameters:
+ * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
+ * - zeta: Twiddle factor to use for the butterfly. This must be in
+ *         Montgomery form and signed canonical.
+ * - start: Offset to the beginning of the butterfly block
+ * - len: Index difference between coefficients subject to a butterfly
+ * - bound: Ghost variable describing coefficient bound: Prior to `start`,
+ *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
+ *          they must be bound by `bound`.
+ * When this function returns, output coefficients in the index range
+ * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
+ * Example:
+ * - start=8, len=4
+ *   This would compute the following four butterflies
+ *          8     --    12
+ *             9    --     13
+ *                10   --     14
+ *                   11   --     15
+ * - start=4, len=2
+ *   This would compute the following two butterflies
+ *          4 -- 6
+ *             5 -- 7
+ */
+static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
+                                unsigned start, unsigned len, int bound)
+__contract__(
+  requires(start < MLKEM_N)
+  requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
+  requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
+  requires(-HALF_Q < zeta && zeta < HALF_Q)
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(array_abs_bound(r, 0, start, bound + MLKEM_Q))
+  requires(array_abs_bound(r, start, MLKEM_N, bound))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q))
+  ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
+{
+  /* `bound` is a ghost variable only needed in the CBMC specification */
+  unsigned j;
+  ((void)bound);
+  for (j = start; j < start + len; j++)
+  __loop__(
+    invariant(start <= j && j <= start + len)
+    /*
+     * Coefficients are updated in strided pairs, so the bounds for the
+     * intermediate states alternate twice between the old and new bound
+     */
+    invariant(array_abs_bound(r, 0,           j,           bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j,           start + len, bound))
+    invariant(array_abs_bound(r, start + len, j + len,     bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j + len,     MLKEM_N,     bound)))
+  {
+    int16_t t;
+    t = fqmul(r[j + len], zeta);
+    r[j + len] = r[j] - t;
+    r[j] = r[j] + t;
+  }
+}
+
+/*
+ * Compute one layer of forward NTT
+ * Parameters:
+ * - r: Pointer to base of polynomial
+ * - len: Stride of butterflies in this layer.
+ * - layer: Ghost variable indicating which layer is being applied.
+ *          Must match `len` via `len == MLKEM_N >> layer`.
+ * Note: `len` could be dropped and computed in the function, but
+ *   we are following the structure of the reference NTT from the
+ *   official Kyber implementation here, merely adding `layer` as
+ *   a ghost variable for the specifications.
+ */
+static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
+  requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
+{
+  unsigned start, k;
+  /* `layer` is a ghost variable only needed in the CBMC specification */
+  ((void)layer);
+  /* Twiddle factors for layer n start at index 2^(layer-1) */
+  k = MLKEM_N / (2 * len);
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(start < MLKEM_N + 2 * len)
+    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
+    invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
+    invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
+  {
+    int16_t zeta = zetas[k++];
+    ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
+  }
+}
+
+/*
+ * Compute full forward NTT
+ * NOTE: This particular implementation satisfies a much tighter
+ * bound on the output coefficients (5*q) than the contractual one (8*q),
+ * but this is not needed in the calling code. Should we change the
+ * base multiplication strategy to require smaller NTT output bounds,
+ * the proof may need strengthening.
+ */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *p)
+{
+  unsigned len, layer;
+  int16_t *r;
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+  r = p->coeffs;
+
+  for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
+  __loop__(
+    invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer))
+    invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)))
+  {
+    ntt_layer(r, len, layer);
+  }
+
+  /* Check the stronger bound */
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
+}
+#else  /* MLKEM_USE_NATIVE_NTT */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *p)
+{
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+  ntt_native(p->coeffs);
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
+}
+#endif /* MLKEM_USE_NATIVE_NTT */
+
+#if !defined(MLKEM_USE_NATIVE_INTT)
+
+/* Compute one layer of inverse NTT */
+static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
+  requires(len == (1 << (8 - layer)))
+  requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+{
+  unsigned start, k;
+  /* `layer` is a ghost variable used only in the specification */
+  ((void)layer);
+  k = MLKEM_N / len - 1;
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+    invariant(start <= MLKEM_N && k <= 127)
+    /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
+    invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
+  {
+    unsigned j;
+    int16_t zeta = zetas[k--];
+    for (j = start; j < start + len; j++)
+    __loop__(
+      invariant(start <= j && j <= start + len)
+      invariant(start <= MLKEM_N && k <= 127)
+      invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+    {
+      int16_t t = r[j];
+      r[j] = barrett_reduce(t + r[j + len]);
+      r[j + len] = r[j + len] - t;
+      r[j + len] = fqmul(r[j + len], zeta);
+    }
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *p)
+{
+  /*
+   * Scale input polynomial to account for Montgomery factor
+   * and NTT twist. This also brings coefficients down to
+   * absolute value < MLKEM_Q.
+   */
+  unsigned j, len, layer;
+  const int16_t f = 1441;
+  int16_t *r = p->coeffs;
+
+  for (j = 0; j < MLKEM_N; j++)
+  __loop__(
+    invariant(j <= MLKEM_N)
+    invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
+  {
+    r[j] = fqmul(r[j], f);
+  }
+
+  /* Run the invNTT layers */
+  for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
+  __loop__(
+    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
+    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+  {
+    invntt_layer(p->coeffs, len, layer);
+  }
+
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
+}
+#else  /* MLKEM_USE_NATIVE_INTT */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *p)
+{
+  intt_native(p->coeffs);
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
+}
+#endif /* MLKEM_USE_NATIVE_INTT */
+
 #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 #define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.h
index 6a14c785d..cb0d67c1a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly.h
@@ -9,7 +9,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "reduce.h"
+#include "debug.h"
 #include "verify.h"
 
 /* Absolute exclusive upper bound for the output of the inverse NTT */
@@ -18,6 +18,9 @@
 /* Absolute exclusive upper bound for the output of the forward NTT */
 #define NTT_BOUND (8 * MLKEM_Q)
 
+#define zetas MLKEM_NAMESPACE(zetas)
+extern const int16_t zetas[128];
+
 /*
  * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
  * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
@@ -38,520 +41,6 @@ typedef struct
   int16_t coeffs[MLKEM_N >> 1];
 } poly_mulcache;
 
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1)
-#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4)
-#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5)
-#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10)
-#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11)
-#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4)
-#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5)
-#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10)
-#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11)
-#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q)
-/* End of static namespacing */
-
-/************************************************************
- * Name: scalar_compress_d1
- *
- * Description: Computes round(u * 2 / q)
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 1.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d1(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 2)
-  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
-{
-  uint32_t d0 = u << 1;
-  d0 *= 645083;
-  d0 += 1u << 30;
-  d0 >>= 31;
-  return d0;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_compress_d4
- *
- * Description: Computes round(u * 16 / q) % 16
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 4.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d4(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 16)
-  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
-{
-  uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */
-  return (d0 + (1u << 27)) >> 28;      /* round(d0/2^28) */
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d4
- *
- * Description: Computes round(u * q / 16)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 4.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d4(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 16)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 8) / 16; }
-
-/************************************************************
- * Name: scalar_compress_d5
- *
- * Description: Computes round(u * 32 / q) % 32
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 5.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d5(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 32)
-  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
-{
-  uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */
-  return (d0 + (1u << 26)) >> 27;      /* round(d0/2^27) */
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d5
- *
- * Description: Computes round(u * q / 32)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 5.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 32
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d5(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 32)
-  ensures(return_value <= MLKEM_Q - 1)
-) { return ((u * MLKEM_Q) + 16) / 32; }
-
-/************************************************************
- * Name: scalar_compress_d10
- *
- * Description: Computes round(u * 2**10 / q) % 2**10
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d10(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < (1u << 10))
-  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
-{
-  uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^32 / MLKEM_Q) */
-  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
-  return (d0 & 0x3FF);
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d10
- *
- * Description: Computes round(u * q / 1024)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d10(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 1024)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 512) / 1024; }
-
-/************************************************************
- * Name: scalar_compress_d11
- *
- * Description: Computes round(u * 2**11 / q) % 2**11
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 11.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d11(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < (1u << 11))
-  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
-{
-  uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */
-  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
-  return (d0 & 0x7FF);
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d11
- *
- * Description: Computes round(u * q / 1024)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d11(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 2048)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 1024) / 2048; }
-
-/************************************************************
- * Name: scalar_signed_to_unsigned_q
- *
- * Description: converts signed polynomial coefficient
- *              from signed (-3328 .. 3328) form to
- *              unsigned form (0 .. 3328).
- *
- * Note: Cryptographic constant time implementation
- *
- * Examples:       0 -> 0
- *                 1 -> 1
- *              3328 -> 3328
- *                -1 -> 3328
- *                -2 -> 3327
- *             -3328 -> 1
- *
- * Arguments: c: signed coefficient to be converted
- ************************************************************/
-static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
-__contract__(
-  requires(c > -MLKEM_Q && c < MLKEM_Q)
-  ensures(return_value >= 0 && return_value < MLKEM_Q)
-  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
-{
-  debug_assert_abs_bound(&c, 1, MLKEM_Q);
-
-  /* Add Q if c is negative, but in constant time */
-  c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
-
-  /* and therefore cast to uint16_t is safe. */
-  debug_assert_bound(&c, 1, 0, MLKEM_Q);
-  return (uint16_t)c;
-}
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
-    (MLKEM_K == 2 || MLKEM_K == 3)
-#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
-/*************************************************
- * Name:        poly_compress_d4
- *
- * Description: Compression (4 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
-
-#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
-/*************************************************
- * Name:        poly_compress_d10
- *
- * Description: Compression (10 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
-
-#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
-/*************************************************
- * Name:        poly_decompress_d4
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
-
-#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
-/*************************************************
- * Name:        poly_decompress_d10
- *
- * Description: De-serialization and subsequent decompression (10 bits) of a
- *              polynomial; approximate inverse of poly_compress_d10
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d10(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
-          || MLKEM_K == 3) */
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
-#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
-/*************************************************
- * Name:        poly_compress_d5
- *
- * Description: Compression (5 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
-
-#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
-/*************************************************
- * Name:        poly_compress_d11
- *
- * Description: Compression (11 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
-
-#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
-/*************************************************
- * Name:        poly_decompress_d5
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
-
-#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
-/*************************************************
- * Name:        poly_decompress_d11
- *
- * Description: De-serialization and subsequent decompression (11 bits) of a
- *              polynomial; approximate inverse of poly_compress_d11
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d11(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
-        */
-
-#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
-/*************************************************
- * Name:        poly_tobytes
- *
- * Description: Serialization of a polynomial.
- *              Signed coefficients are converted to
- *              unsigned form before serialization.
- *
- * Arguments:   INPUT:
- *              - a: const pointer to input polynomial,
- *                with each coefficient in the range [0,1,..,Q-1]
- *              OUTPUT
- *              - r: pointer to output byte array
- *                   (of MLKEM_POLYBYTES bytes)
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYBYTES))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r))
-);
-
-
-#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes)
-/*************************************************
- * Name:        poly_frombytes
- *
- * Description: De-serialization of a polynomial.
- *
- * Arguments:   INPUT
- *              - a: pointer to input byte array
- *                   (of MLKEM_POLYBYTES bytes)
- *              OUTPUT
- *              - r: pointer to output polynomial, with
- *                   each coefficient unsigned and in the range
- *                   0 .. 4095
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))
-);
-
-
-#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg)
-/*************************************************
- * Name:        poly_frommsg
- *
- * Description: Convert 32-byte message to polynomial
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *msg: pointer to input message
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
-__contract__(
-  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
-
-#define poly_tomsg MLKEM_NAMESPACE(poly_tomsg)
-/*************************************************
- * Name:        poly_tomsg
- *
- * Description: Convert polynomial to 32-byte message
- *
- * Arguments:   - uint8_t *msg: pointer to output message
- *              - const poly *r: pointer to input polynomial
- *                Coefficients must be unsigned canonical
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r)
-__contract__(
-  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(msg))
-);
-
 #define poly_basemul_montgomery_cached \
   MLKEM_NAMESPACE(poly_basemul_montgomery_cached)
 /*************************************************
@@ -715,4 +204,56 @@ __contract__(
   assigns(object_whole(r))
 );
 
+#define poly_ntt MLKEM_NAMESPACE(poly_ntt)
+/*************************************************
+ * Name:        poly_ntt
+ *
+ * Description: Computes negacyclic number-theoretic transform (NTT) of
+ *              a polynomial in place.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bound by MLKEM_Q in absolute value.
+ *
+ *              The output polynomial is in bitreversed order, and
+ *              coefficient-wise bound by NTT_BOUND in absolute value.
+ *
+ *              (NOTE: Sometimes the input to the NTT is actually smaller,
+ *               which gives better bounds.)
+ *
+ * Arguments:   - poly *r: pointer to in/output polynomial
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND))
+);
+
+#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont)
+/*************************************************
+ * Name:        poly_invntt_tomont
+ *
+ * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+ *              of a polynomial in place;
+ *              inputs assumed to be in bitreversed order, output in normal
+ *              order
+ *
+ *              The input is assumed to be in bitreversed order, and can
+ *              have arbitrary coefficients in int16_t.
+ *
+ *              The output polynomial is in normal order, and
+ *              coefficient-wise bound by INVNTT_BOUND in absolute value.
+ *
+ * Arguments:   - poly *r: pointer to in/output polynomial
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND))
+);
+
 #endif /* POLY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly_k.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly_k.c
new file mode 100644
index 000000000..c2d330ea9
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly_k.c
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "poly_k.h"
+#include <stdint.h>
+#include <string.h>
+#include "arith_backend.h"
+#include "compress.h"
+#include "sampling.h"
+#include "symmetric.h"
+
+#include "debug.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1)
+#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2)
+/* End of static namespacing */
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
+                         const polyvec *a)
+{
+  unsigned i;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_decompress_du(polyvec *r,
+                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
+  }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
+{
+  unsigned i;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
+  }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_ntt(polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_ntt(&r->vec[i]);
+  }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_invntt_tomont(polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_invntt_tomont(&r->vec[i]);
+  }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND);
+}
+
+#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+{
+  unsigned i;
+  poly t;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+
+  poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]);
+  for (i = 1; i < MLKEM_K; i++)
+  {
+    poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i],
+                                   &b_cache->vec[i]);
+    poly_add(r, &t);
+  }
+
+  /*
+   * r accumulates MLKEM_K base products, each presumably bounded by
+   * 2 * MLKEM_Q in absolute value (confirm against the contract of
+   * poly_basemul_montgomery_cached()). Not part of the spec, so native
+   * implementations stay unconstrained, but checked here nonetheless.
+   */
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_K * 2 * MLKEM_Q);
+}
+#else  /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+{
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+  /* Omitting bounds assertion for cache since native implementations may
+   * decide not to use a mulcache. Note that the C backend implementation
+   * of poly_basemul_montgomery_cached() does still include the check. */
+  polyvec_basemul_acc_montgomery_cached_native(r->coeffs, (const int16_t *)a,
+                                               (const int16_t *)b,
+                                               (const int16_t *)b_cache);
+}
+#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
+{
+  polyvec_mulcache b_cache;
+  polyvec_mulcache_compute(&b_cache, b);
+  polyvec_basemul_acc_montgomery_cached(r, a, b, &b_cache);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_mulcache_compute(&x->vec[i], &a->vec[i]);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_reduce(polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_reduce(&r->vec[i]);
+  }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_add(polyvec *r, const polyvec *b)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_add(&r->vec[i], &b->vec[i]);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_tomont(polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_tomont(&r->vec[i]);
+  }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
+}
+
+
+/*************************************************
+ * Name:        poly_cbd_eta1
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta1(poly *r,
+                                 const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
+)
+{
+#if MLKEM_ETA1 == 2
+  poly_cbd2(r, buf);
+#elif MLKEM_ETA1 == 3
+  poly_cbd3(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA1"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+{
+  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
+  memcpy(extkey0, seed, MLKEM_SYMBYTES);
+  memcpy(extkey1, seed, MLKEM_SYMBYTES);
+  memcpy(extkey2, seed, MLKEM_SYMBYTES);
+  memcpy(extkey3, seed, MLKEM_SYMBYTES);
+  extkey0[MLKEM_SYMBYTES] = nonce0;
+  extkey1[MLKEM_SYMBYTES] = nonce1;
+  extkey2[MLKEM_SYMBYTES] = nonce2;
+  extkey3[MLKEM_SYMBYTES] = nonce3;
+  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
+  poly_cbd_eta1(r0, buf0);
+  poly_cbd_eta1(r1, buf1);
+  poly_cbd_eta1(r2, buf2);
+  poly_cbd_eta1(r3, buf3);
+
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
+}
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+/*************************************************
+ * Name:        poly_cbd_eta2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta2(poly *r,
+                                 const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1)))
+{
+#if MLKEM_ETA2 == 2
+  poly_cbd2(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA2"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+{
+  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
+
+  memcpy(extkey, seed, MLKEM_SYMBYTES);
+  extkey[MLKEM_SYMBYTES] = nonce;
+  prf_eta2(buf, extkey);
+  /* PRF input above is the seed followed by the one-byte nonce */
+  poly_cbd_eta2(r, buf);
+  /* Same bound as the postcondition of poly_cbd_eta2() */
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1);
+}
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+
+#if MLKEM_K == 2
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+{
+  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
+  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+  extkey[0][MLKEM_SYMBYTES] = nonce0;
+  extkey[1][MLKEM_SYMBYTES] = nonce1;
+  extkey[2][MLKEM_SYMBYTES] = nonce2;
+  extkey[3][MLKEM_SYMBYTES] = nonce3;
+
+  prf_eta1(buf1[0], extkey[0]);
+  prf_eta1(buf1[1], extkey[1]);
+  prf_eta2(buf2[0], extkey[2]);
+  prf_eta2(buf2[1], extkey[3]);
+
+  poly_cbd_eta1(r0, buf1[0]);
+  poly_cbd_eta1(r1, buf1[1]);
+  poly_cbd_eta2(r2, buf2[0]);
+  poly_cbd_eta2(r3, buf2[1]);
+
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1);
+}
+#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly_k.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly_k.h
new file mode 100644
index 000000000..0aea95912
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/poly_k.h
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef POLY_K_H
+#define POLY_K_H
+
+#include <stdint.h>
+#include "common.h"
+#include "compress.h"
+#include "poly.h"
+
+#define polyvec MLKEM_NAMESPACE_K(polyvec)
+typedef struct
+{
+  poly vec[MLKEM_K];
+} ALIGN polyvec;
+
+#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache)
+typedef struct
+{
+  poly_mulcache vec[MLKEM_K];
+} polyvec_mulcache;
+
+#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du)
+/*************************************************
+ * Name:        poly_compress_du
+ *
+ * Description: Compression (du bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU)))
+{
+#if MLKEM_DU == 10
+  poly_compress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_compress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du)
+/*************************************************
+ * Name:        poly_decompress_du
+ *
+ * Description: De-serialization and subsequent decompression (du bits) of a
+ *              polynomial; approximate inverse of poly_compress_du
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_du(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DU == 10
+  poly_decompress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_decompress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv)
+/*************************************************
+ * Name:        poly_compress_dv
+ *
+ * Description: Compression (dv bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r)))
+{
+#if MLKEM_DV == 4
+  poly_compress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_compress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+
+#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv)
+/*************************************************
+ * Name:        poly_decompress_dv
+ *
+ * Description: De-serialization and subsequent decompression (dv bits) of a
+ *              polynomial; approximate inverse of poly_compress_dv
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_dv(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DV == 4
+  poly_decompress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_decompress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du)
+/*************************************************
+ * Name:        polyvec_compress_du
+ *
+ * Description: Compress and serialize vector of polynomials
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU)
+ *              - const polyvec *a: pointer to input vector of polynomials.
+ *                                  Coefficients must be unsigned canonical,
+ *                                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
+                         const polyvec *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(forall(k0, 0, MLKEM_K,
+         array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+  assigns(object_whole(r))
+);
+
+#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du)
+/*************************************************
+ * Name:        polyvec_decompress_du
+ *
+ * Description: De-serialize and decompress vector of polynomials;
+ *              approximate inverse of polyvec_compress_du
+ *
+ * Arguments:   - polyvec *r:       pointer to output vector of polynomials.
+ *                Output will have coefficients normalized to [0,..,q-1].
+ *              - const uint8_t *a: pointer to input byte array
+ *                                  (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU)
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_decompress_du(polyvec *r,
+                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(k0, 0, MLKEM_K,
+         array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+);
+
+#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes)
+/*************************************************
+ * Name:        polyvec_tobytes
+ *
+ * Description: Serialize vector of polynomials
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (needs space for MLKEM_POLYVECBYTES)
+ *              - const polyvec *a: pointer to input vector of polynomials
+ *                  Each polynomial must have coefficients in [0,..,q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
+__contract__(
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(memory_no_alias(r, MLKEM_POLYVECBYTES))
+  requires(forall(k0, 0, MLKEM_K,
+         array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+  assigns(object_whole(r))
+);
+
+#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes)
+/*************************************************
+ * Name:        polyvec_frombytes
+ *
+ * Description: De-serialize vector of polynomials;
+ *              inverse of polyvec_tobytes
+ *
+ * Arguments:   - polyvec *r: pointer to output vector of polynomials.
+ *                 Output will have coefficients normalized in [0..4095].
+ *              - const uint8_t *a: pointer to input byte array
+ *                 (of length MLKEM_POLYVECBYTES)
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(memory_no_alias(a, MLKEM_POLYVECBYTES))
+  assigns(object_whole(r))
+  ensures(forall(k0, 0, MLKEM_K,
+        array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
+);
+
+#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt)
+/*************************************************
+ * Name:        polyvec_ntt
+ *
+ * Description: Apply forward NTT to all elements of a vector of polynomials.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bound by MLKEM_Q in absolute value.
+ *
+ *              The output polynomial is in bitreversed order, and
+ *              coefficient-wise bound by NTT_BOUND in absolute value.
+ *
+ * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_ntt(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(forall(j, 0, MLKEM_K,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+  assigns(object_whole(r))
+  ensures(forall(j, 0, MLKEM_K,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND)))
+);
+
+#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont)
+/*************************************************
+ * Name:        polyvec_invntt_tomont
+ *
+ * Description: Apply inverse NTT to all elements of a vector of polynomials
+ *              and multiply by Montgomery factor 2^16
+ *
+ *              The input is assumed to be in bitreversed order, and can
+ *              have arbitrary coefficients in int16_t.
+ *
+ *              The output polynomial is in normal order, and
+ *              coefficient-wise bound by INVNTT_BOUND in absolute value.
+ *
+ *
+ * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_invntt_tomont(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(j, 0, MLKEM_K,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, INVNTT_BOUND)))
+);
+
+#define polyvec_basemul_acc_montgomery \
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery)
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery
+ *
+ * Description: Multiply elements of a and b in NTT domain, accumulate into r,
+ *              and multiply by 2^-16.
+ *
+ * Arguments: - poly *r: pointer to output polynomial
+ *            - const polyvec *a: pointer to first input vector of polynomials
+ *            - const polyvec *b: pointer to second input vector of polynomials
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(memory_no_alias(b, sizeof(polyvec)))
+  requires(forall(k1, 0, MLKEM_K,
+    array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
+  assigns(memory_slice(r, sizeof(poly)))
+);
+
+
+#define polyvec_basemul_acc_montgomery_cached \
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached)
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery_cached
+ *
+ * Description: Scalar product of two vectors of polynomials in NTT domain,
+ *              using mulcache for second operand.
+ *
+ *              Bounds:
+ *              - Every coefficient of a is assumed to be in [0..4095]
+ *              - No bounds guarantees for the coefficients in the result.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const polyvec *a: pointer to first input polynomial vector
+ *              - const polyvec *b: pointer to second input polynomial vector
+ *              - const polyvec_mulcache *b_cache: pointer to mulcache
+ *                  for second input polynomial vector. Can be computed
+ *                  via polyvec_mulcache_compute().
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(memory_no_alias(b, sizeof(polyvec)))
+  requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache)))
+  requires(forall(k1, 0, MLKEM_K,
+     array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
+  assigns(memory_slice(r, sizeof(poly)))
+);
+
+#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute)
+/************************************************************
+ * Name: polyvec_mulcache_compute
+ *
+ * Description: Computes the mulcache for a vector of polynomials in NTT domain
+ *
+ *              The mulcache of a degree-2 polynomial b := b0 + b1*X
+ *              in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when
+ *              computing products of b in Fq[X]/(X^2-zeta).
+ *
+ *              The mulcache of a polynomial in NTT domain -- which is
+ *              a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta),
+ *              for varying zeta, is the 128-tuple of mulcaches of those
+ *              polynomials.
+ *
+ *              The mulcache of a vector of polynomials is the vector
+ *              of mulcaches of its entries.
+ *
+ * Arguments: - x: Pointer to mulcache to be populated
+ *            - a: Pointer to input polynomial vector
+ ************************************************************/
+/*
+ * NOTE: The default C implementation of this function populates
+ * the mulcache with values in (-q,q), but this is not needed for the
+ * higher level safety proofs, and thus not part of the spec.
+ */
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
+__contract__(
+  requires(memory_no_alias(x, sizeof(polyvec_mulcache)))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  assigns(object_whole(x))
+);
+
+#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce)
+/*************************************************
+ * Name:        polyvec_reduce
+ *
+ * Description: Applies Barrett reduction to each coefficient
+ *              of each element of a vector of polynomials;
+ *              for details of the Barrett reduction see comments in reduce.c
+ *
+ * Arguments:   - polyvec *r: pointer to input/output vector of polynomials
+ **************************************************/
+/*
+ * NOTE: The semantics of polyvec_reduce() is different in
+ *       the reference implementation, which requires
+ *       signed canonical output data. Unsigned canonical
+ *       outputs are better suited to the only remaining
+ *       use of poly_reduce() in the context of (de)serialization.
+ */
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_reduce(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(k0, 0, MLKEM_K,
+    array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+);
+
+#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add)
+/*************************************************
+ * Name:        polyvec_add
+ *
+ * Description: Add vectors of polynomials
+ *
+ * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be
+ *              added to
+ *            - const polyvec *b: pointer to second input vector of polynomials
+ *
+ * The coefficients of r and b must be so that the addition does
+ * not overflow. Otherwise, the behaviour of this function is undefined.
+ *
+ * The coefficients returned in *r are in int16_t which is sufficient
+ * to prove type-safety of calling units. Therefore, no stronger
+ * ensures clause is required on this function.
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_add(polyvec *r, const polyvec *b)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(memory_no_alias(b, sizeof(polyvec)))
+  requires(forall(j0, 0, MLKEM_K,
+          forall(k0, 0, MLKEM_N,
+            (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX)))
+  requires(forall(j1, 0, MLKEM_K,
+          forall(k1, 0, MLKEM_N,
+            (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN)))
+  assigns(object_whole(r))
+);
+
+#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont)
+/*************************************************
+ * Name:        polyvec_tomont
+ *
+ * Description: Inplace conversion of all coefficients of a polynomial
+ *              vector from normal domain to Montgomery domain
+ *
+ *              Bounds: Output < q in absolute value.
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_tomont(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(memory_slice(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(j, 0, MLKEM_K,
+    array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+);
+
+#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial distribution
+ * with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+/* Depending on MLKEM_K, the pointers passed to this function belong
+   to the same objects, so we cannot use memory_no_alias for r0-r3.
+
+   NOTE: Somehow it is important to use memory_no_alias() first in the
+         conjunctions defining each case.
+*/
+#if MLKEM_K == 2
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
+    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#elif MLKEM_K == 4
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case B: r0, r1, r2, r3 consecutive */
+    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#elif MLKEM_K == 3
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case C: r0, r1, r2 consecutive */
+ (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
+  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+);
+#endif /* MLKEM_K */
+
+#if MLKEM_ETA1 == MLKEM_ETA2
+/*
+ * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
+ * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
+ * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
+ */
+#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
+#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2)
+/*************************************************
+ * Name:        poly_getnoise_eta2
+ *
+ * Description: Sample a polynomial deterministically from a seed and a nonce,
+ *              with output polynomial close to centered binomial distribution
+ *              with parameter MLKEM_ETA2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
+);
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+#if MLKEM_K == 2
+#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1122_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial
+ * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+__contract__(
+  requires( /* r0, r1 consecutive, r2, r3 consecutive */
+ (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
+     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
+);
+#endif /* MLKEM_K == 2 */
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.c
deleted file mode 100644
index 50ea1c34a..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.c
+++ /dev/null
@@ -1,330 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "polyvec.h"
-#include <stdint.h>
-#include <string.h>
-#include "arith_backend.h"
-#include "cbd.h"
-#include "ntt.h"
-#include "poly.h"
-#include "symmetric.h"
-
-#include "debug.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1)
-#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2)
-/* End of static namespacing */
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
-                         const polyvec *a)
-{
-  unsigned i;
-  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_decompress_du(polyvec *r,
-                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
-  }
-
-  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
-{
-  unsigned i;
-  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
-  }
-
-  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_ntt(polyvec *r)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_ntt(&r->vec[i]);
-  }
-
-  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_invntt_tomont(polyvec *r)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_invntt_tomont(&r->vec[i]);
-  }
-
-  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND);
-}
-
-#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
-                                           const polyvec *b,
-                                           const polyvec_mulcache *b_cache)
-{
-  unsigned i;
-  poly t;
-  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
-
-  poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]);
-  for (i = 1; i < MLKEM_K; i++)
-  {
-    poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i],
-                                   &b_cache->vec[i]);
-    poly_add(r, &t);
-  }
-
-  /*
-   * This bound is true for the C implementation, but not needed
-   * in the higher level bounds reasoning. It is thus omitted
-   * them from the spec to not unnecessarily constrain native
-   * implementations, but checked here nonetheless.
-   */
-  debug_assert_abs_bound(r, MLKEM_K, MLKEM_N * 2 * MLKEM_Q);
-}
-#else  /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
-                                           const polyvec *b,
-                                           const polyvec_mulcache *b_cache)
-{
-  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
-  /* Omitting bounds assertion for cache since native implementations may
-   * decide not to use a mulcache. Note that the C backend implementation
-   * of poly_basemul_montgomery_cached() does still include the check. */
-  polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache);
-}
-#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
-{
-  polyvec_mulcache b_cache;
-  polyvec_mulcache_compute(&b_cache, b);
-  polyvec_basemul_acc_montgomery_cached(r, a, b, &b_cache);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_mulcache_compute(&x->vec[i], &a->vec[i]);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_reduce(polyvec *r)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_reduce(&r->vec[i]);
-  }
-
-  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_add(polyvec *r, const polyvec *b)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_add(&r->vec[i], &b->vec[i]);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_tomont(polyvec *r)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_tomont(&r->vec[i]);
-  }
-
-  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
-}
-
-
-/*************************************************
- * Name:        poly_cbd_eta1
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA1.
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-static INLINE void poly_cbd_eta1(poly *r,
-                                 const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
-)
-{
-#if MLKEM_ETA1 == 2
-  poly_cbd2(r, buf);
-#elif MLKEM_ETA1 == 3
-  poly_cbd3(r, buf);
-#else
-#error "Invalid value of MLKEM_ETA1"
-#endif
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-{
-  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
-  memcpy(extkey0, seed, MLKEM_SYMBYTES);
-  memcpy(extkey1, seed, MLKEM_SYMBYTES);
-  memcpy(extkey2, seed, MLKEM_SYMBYTES);
-  memcpy(extkey3, seed, MLKEM_SYMBYTES);
-  extkey0[MLKEM_SYMBYTES] = nonce0;
-  extkey1[MLKEM_SYMBYTES] = nonce1;
-  extkey2[MLKEM_SYMBYTES] = nonce2;
-  extkey3[MLKEM_SYMBYTES] = nonce3;
-  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
-  poly_cbd_eta1(r0, buf0);
-  poly_cbd_eta1(r1, buf1);
-  poly_cbd_eta1(r2, buf2);
-  poly_cbd_eta1(r3, buf3);
-
-  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
-}
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-/*************************************************
- * Name:        poly_cbd_eta2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA2.
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-static INLINE void poly_cbd_eta2(poly *r,
-                                 const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1)))
-{
-#if MLKEM_ETA2 == 2
-  poly_cbd2(r, buf);
-#else
-#error "Invalid value of MLKEM_ETA2"
-#endif
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-{
-  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
-
-  memcpy(extkey, seed, MLKEM_SYMBYTES);
-  extkey[MLKEM_SYMBYTES] = nonce;
-  prf_eta2(buf, extkey);
-
-  poly_cbd_eta2(r, buf);
-
-  debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA1 + 1);
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-
-#if MLKEM_K == 2
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-{
-  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
-  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
-  extkey[0][MLKEM_SYMBYTES] = nonce0;
-  extkey[1][MLKEM_SYMBYTES] = nonce1;
-  extkey[2][MLKEM_SYMBYTES] = nonce2;
-  extkey[3][MLKEM_SYMBYTES] = nonce3;
-
-  prf_eta1(buf1[0], extkey[0]);
-  prf_eta1(buf1[1], extkey[1]);
-  prf_eta2(buf2[0], extkey[2]);
-  prf_eta2(buf2[1], extkey[3]);
-
-  poly_cbd_eta1(r0, buf1[0]);
-  poly_cbd_eta1(r1, buf1[1]);
-  poly_cbd_eta2(r2, buf2[0]);
-  poly_cbd_eta2(r3, buf2[1]);
-
-  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1);
-  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1);
-}
-#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.h
deleted file mode 100644
index 8be8579e0..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/polyvec.h
+++ /dev/null
@@ -1,595 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef POLYVEC_H
-#define POLYVEC_H
-
-#include <stdint.h>
-#include "common.h"
-#include "poly.h"
-
-#define polyvec MLKEM_NAMESPACE_K(polyvec)
-typedef struct
-{
-  poly vec[MLKEM_K];
-} ALIGN polyvec;
-
-#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache)
-typedef struct
-{
-  poly_mulcache vec[MLKEM_K];
-} polyvec_mulcache;
-
-#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du)
-/*************************************************
- * Name:        poly_compress_du
- *
- * Description: Compression (du bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU],
-                                    const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU)))
-{
-#if MLKEM_DU == 10
-  poly_compress_d10(r, a);
-#elif MLKEM_DU == 11
-  poly_compress_d11(r, a);
-#else
-#error "Invalid value of MLKEM_DU"
-#endif
-}
-
-#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du)
-/*************************************************
- * Name:        poly_decompress_du
- *
- * Description: De-serialization and subsequent decompression (du bits) of a
- *              polynomial; approximate inverse of poly_compress_du
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-static INLINE void poly_decompress_du(
-    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-#if MLKEM_DU == 10
-  poly_decompress_d10(r, a);
-#elif MLKEM_DU == 11
-  poly_decompress_d11(r, a);
-#else
-#error "Invalid value of MLKEM_DU"
-#endif
-}
-
-#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv)
-/*************************************************
- * Name:        poly_compress_dv
- *
- * Description: Compression (dv bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV],
-                                    const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r)))
-{
-#if MLKEM_DV == 4
-  poly_compress_d4(r, a);
-#elif MLKEM_DV == 5
-  poly_compress_d5(r, a);
-#else
-#error "Invalid value of MLKEM_DV"
-#endif
-}
-
-
-#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv)
-/*************************************************
- * Name:        poly_decompress_dv
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-static INLINE void poly_decompress_dv(
-    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-#if MLKEM_DV == 4
-  poly_decompress_d4(r, a);
-#elif MLKEM_DV == 5
-  poly_decompress_d5(r, a);
-#else
-#error "Invalid value of MLKEM_DV"
-#endif
-}
-
-#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du)
-/*************************************************
- * Name:        polyvec_compress_du
- *
- * Description: Compress and serialize vector of polynomials
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU)
- *              - const polyvec *a: pointer to input vector of polynomials.
- *                                  Coefficients must be unsigned canonical,
- *                                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
-                         const polyvec *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  requires(forall(k0, 0, MLKEM_K,
-         array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-  assigns(object_whole(r))
-);
-
-#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du)
-/*************************************************
- * Name:        polyvec_decompress_du
- *
- * Description: De-serialize and decompress vector of polynomials;
- *              approximate inverse of polyvec_compress_du
- *
- * Arguments:   - polyvec *r:       pointer to output vector of polynomials.
- *                Output will have coefficients normalized to [0,..,q-1].
- *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU)
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_decompress_du(polyvec *r,
-                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  assigns(object_whole(r))
-  ensures(forall(k0, 0, MLKEM_K,
-         array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-);
-
-#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes)
-/*************************************************
- * Name:        polyvec_tobytes
- *
- * Description: Serialize vector of polynomials
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (needs space for MLKEM_POLYVECBYTES)
- *              - const polyvec *a: pointer to input vector of polynomials
- *                  Each polynomial must have coefficients in [0,..,q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
-__contract__(
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  requires(memory_no_alias(r, MLKEM_POLYVECBYTES))
-  requires(forall(k0, 0, MLKEM_K,
-         array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-  assigns(object_whole(r))
-);
-
-#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes)
-/*************************************************
- * Name:        polyvec_frombytes
- *
- * Description: De-serialize vector of polynomials;
- *              inverse of polyvec_tobytes
- *
- * Arguments:   - const polyvec *a: pointer to output vector of polynomials
- *                 (of length MLKEM_POLYVECBYTES). Output will have coefficients
- *                 normalized in [0..4095].
- *              - uint8_t *r: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  requires(memory_no_alias(a, MLKEM_POLYVECBYTES))
-  assigns(object_whole(r))
-  ensures(forall(k0, 0, MLKEM_K,
-        array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
-);
-
-#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt)
-/*************************************************
- * Name:        polyvec_ntt
- *
- * Description: Apply forward NTT to all elements of a vector of polynomials.
- *
- *              The input is assumed to be in normal order and
- *              coefficient-wise bound by MLKEM_Q in absolute value.
- *
- *              The output polynomial is in bitreversed order, and
- *              coefficient-wise bound by NTT_BOUND in absolute value.
- *
- * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_ntt(polyvec *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  requires(forall(j, 0, MLKEM_K,
-  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
-  assigns(object_whole(r))
-  ensures(forall(j, 0, MLKEM_K,
-  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND)))
-);
-
-#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont)
-/*************************************************
- * Name:        polyvec_invntt_tomont
- *
- * Description: Apply inverse NTT to all elements of a vector of polynomials
- *              and multiply by Montgomery factor 2^16
- *
- *              The input is assumed to be in bitreversed order, and can
- *              have arbitrary coefficients in int16_t.
- *
- *              The output polynomial is in normal order, and
- *              coefficient-wise bound by INVNTT_BOUND in absolute value.
- *
- *
- * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_invntt_tomont(polyvec *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  assigns(object_whole(r))
-  ensures(forall(j, 0, MLKEM_K,
-  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, INVNTT_BOUND)))
-);
-
-#define polyvec_basemul_acc_montgomery \
-  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery)
-/*************************************************
- * Name:        polyvec_basemul_acc_montgomery
- *
- * Description: Multiply elements of a and b in NTT domain, accumulate into r,
- *              and multiply by 2^-16.
- *
- * Arguments: - poly *r: pointer to output polynomial
- *            - const polyvec *a: pointer to first input vector of polynomials
- *            - const polyvec *b: pointer to second input vector of polynomials
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  requires(memory_no_alias(b, sizeof(polyvec)))
-  requires(forall(k1, 0, MLKEM_K,
-    array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
-  assigns(memory_slice(r, sizeof(poly)))
-);
-
-
-#define polyvec_basemul_acc_montgomery_cached \
-  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached)
-/*************************************************
- * Name:        polyvec_basemul_acc_montgomery_cached
- *
- * Description: Scalar product of two vectors of polynomials in NTT domain,
- *              using mulcache for second operand.
- *
- *              Bounds:
- *              - Every coefficient of a is assumed to be in [0..4095]
- *              - No bounds guarantees for the coefficients in the result.
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const polyvec *a: pointer to first input polynomial vector
- *              - const polyvec *b: pointer to second input polynomial vector
- *              - const polyvec_mulcache *b_cache: pointer to mulcache
- *                  for second input polynomial vector. Can be computed
- *                  via polyvec_mulcache_compute().
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
-                                           const polyvec *b,
-                                           const polyvec_mulcache *b_cache)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  requires(memory_no_alias(b, sizeof(polyvec)))
-  requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache)))
-  requires(forall(k1, 0, MLKEM_K,
-     array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
-  assigns(memory_slice(r, sizeof(poly)))
-);
-
-#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute)
-/************************************************************
- * Name: polyvec_mulcache_compute
- *
- * Description: Computes the mulcache for a vector of polynomials in NTT domain
- *
- *              The mulcache of a degree-2 polynomial b := b0 + b1*X
- *              in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when
- *              computing products of b in Fq[X]/(X^2-zeta).
- *
- *              The mulcache of a polynomial in NTT domain -- which is
- *              a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta),
- *              for varying zeta, is the 128-tuple of mulcaches of those
- *              polynomials.
- *
- *              The mulcache of a vector of polynomials is the vector
- *              of mulcaches of its entries.
- *
- * Arguments: - x: Pointer to mulcache to be populated
- *            - a: Pointer to input polynomial vector
- ************************************************************/
-/*
- * NOTE: The default C implementation of this function populates
- * the mulcache with values in (-q,q), but this is not needed for the
- * higher level safety proofs, and thus not part of the spec.
- */
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
-__contract__(
-  requires(memory_no_alias(x, sizeof(polyvec_mulcache)))
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  assigns(object_whole(x))
-);
-
-#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce)
-/*************************************************
- * Name:        polyvec_reduce
- *
- * Description: Applies Barrett reduction to each coefficient
- *              of each element of a vector of polynomials;
- *              for details of the Barrett reduction see comments in reduce.c
- *
- * Arguments:   - polyvec *r: pointer to input/output polynomial
- **************************************************/
-/*
- * NOTE: The semantics of polyvec_reduce() is different in
- *       the reference implementation, which requires
- *       signed canonical output data. Unsigned canonical
- *       outputs are better suited to the only remaining
- *       use of poly_reduce() in the context of (de)serialization.
- */
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_reduce(polyvec *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  assigns(object_whole(r))
-  ensures(forall(k0, 0, MLKEM_K,
-    array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-);
-
-#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add)
-/*************************************************
- * Name:        polyvec_add
- *
- * Description: Add vectors of polynomials
- *
- * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be
- *              added to
- *            - const polyvec *b: pointer to second input vector of polynomials
- *
- * The coefficients of r and b must be so that the addition does
- * not overflow. Otherwise, the behaviour of this function is undefined.
- *
- * The coefficients returned in *r are in int16_t which is sufficient
- * to prove type-safety of calling units. Therefore, no stronger
- * ensures clause is required on this function.
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_add(polyvec *r, const polyvec *b)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  requires(memory_no_alias(b, sizeof(polyvec)))
-  requires(forall(j0, 0, MLKEM_K,
-          forall(k0, 0, MLKEM_N,
-            (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX)))
-  requires(forall(j1, 0, MLKEM_K,
-          forall(k1, 0, MLKEM_N,
-            (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN)))
-  assigns(object_whole(r))
-);
-
-#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont)
-/*************************************************
- * Name:        polyvec_tomont
- *
- * Description: Inplace conversion of all coefficients of a polynomial
- *              vector from normal domain to Montgomery domain
- *
- *              Bounds: Output < q in absolute value.
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_tomont(polyvec *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  assigns(memory_slice(r, sizeof(polyvec)))
-  assigns(object_whole(r))
-  ensures(forall(j, 0, MLKEM_K,
-    array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
-);
-
-#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and nonces, with output polynomials close to centered binomial distribution
- * with parameter MLKEM_ETA1.
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-/* Depending on MLKEM_K, the pointers passed to this function belong
-   to the same objects, so we cannot use memory_no_alias for r0-r3.
-
-   NOTE: Somehow it is important to use memory_no_alias() first in the
-         conjunctions defining each case.
-*/
-#if MLKEM_K == 2
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
-    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 4
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case B: r0, r1, r2, r3 consecutive */
-    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 3
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case C: r0, r1, r2 consecutive */
- (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
-  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#endif /* MLKEM_K */
-
-#if MLKEM_ETA1 == MLKEM_ETA2
-/*
- * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
- * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
- * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
- */
-#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
-#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2)
-/*************************************************
- * Name:        poly_getnoise_eta2
- *
- * Description: Sample a polynomial deterministically from a seed and a nonce,
- *              with output polynomial close to centered binomial distribution
- *              with parameter MLKEM_ETA2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1122_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and a nonces, with output polynomials close to centered binomial
- * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-__contract__(
-  requires( /* r0, r1 consecutive, r2, r3 consecutive */
- (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
-  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
-     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
-);
-#endif /* MLKEM_K == 2 */
-
-#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/reduce.h
deleted file mode 100644
index b432a4201..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/reduce.h
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef REDUCE_H
-#define REDUCE_H
-
-#include <stdint.h>
-#include "cbmc.h"
-#include "common.h"
-#include "debug.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16)
-#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic)
-#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce)
-#define fqmul MLKEM_NAMESPACE(fqmul)
-#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce)
-/* End of static namespacing */
-
-#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */
-
-/*************************************************
- * Name:        cast_uint16_to_int16
- *
- * Description: Cast uint16 value to int16
- *
- * Returns:
- *   input x in     0 .. 32767: returns value unchanged
- *   input x in 32768 .. 65535: returns (x - 65536)
- **************************************************/
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
-ALWAYS_INLINE
-static INLINE int16_t cast_uint16_to_int16(uint16_t x)
-{
-  /*
-   * PORTABILITY: This relies on uint16_t -> int16_t
-   * being implemented as the inverse of int16_t -> uint16_t,
-   * which is implementation-defined (C99 6.3.1.3 (3))
-   * CBMC (correctly) fails to prove this conversion is OK,
-   * so we have to suppress that check here
-   */
-  return (int16_t)x;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/*************************************************
- * Name:        montgomery_reduce_generic
- *
- * Description: Generic Montgomery reduction; given a 32-bit integer a, computes
- *              16-bit integer congruent to a * R^-1 mod q, where R=2^16
- *
- * Arguments:   - int32_t a: input integer to be reduced
- *
- * Returns:     integer congruent to a * R^-1 modulo q, with absolute value
- *                <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2
- *
- **************************************************/
-ALWAYS_INLINE
-static INLINE int16_t montgomery_reduce_generic(int32_t a)
-{
-  /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */
-  const uint32_t QINV = 62209; /* q^-1 mod 2^16 */
-
-  /*  Compute a*q^{-1} mod 2^16 in unsigned representatives */
-  const uint16_t a_reduced = a & UINT16_MAX;
-  const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
-
-  /* Lift to signed canonical representative mod 2^16. */
-  const int16_t t = cast_uint16_to_int16(a_inverted);
-
-  int32_t r = a - ((int32_t)t * MLKEM_Q);
-  /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */
-
-  /*
-   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
-   * implementation-defined for negative left argument. Here,
-   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
-   */
-  r = r >> 16;
-  /* Bounds: |r >> 16| <= ceil(|r| / 2^16)
-   *                   <= ceil(|a| / 2^16 + MLKEM_Q / 2)
-   *                   <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
-   *
-   * (Note that |a >> n| = ceil(|a| / 2^16) for negative a)
-   */
-
-  return (int16_t)r;
-}
-
-/*************************************************
- * Name:        montgomery_reduce
- *
- * Description: Montgomery reduction
- *
- * Arguments:   - int32_t a: input integer to be reduced
- *                  Must be smaller than 2 * 2^12 * 2^15 in absolute value.
- *
- * Returns:     integer congruent to a * R^-1 modulo q,
- *              smaller than 2 * q in absolute value.
- **************************************************/
-static INLINE int16_t montgomery_reduce(int32_t a)
-__contract__(
-  requires(a > -(2 * UINT12_LIMIT * 32768))
-  requires(a <  (2 * UINT12_LIMIT * 32768))
-  ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
-)
-{
-  int16_t res;
-  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
-
-  res = montgomery_reduce_generic(a);
-  /* Bounds:
-   * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2
-   *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
-   *        < 2 * MLKEM_Q */
-
-  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
-  return res;
-}
-
-/*************************************************
- * Name:        fqmul
- *
- * Description: Montgomery multiplication modulo q=3329
- *
- * Arguments:   - int16_t a: first factor
- *                  Can be any int16_t.
- *              - int16_t b: second factor.
- *                  Must be signed canonical (abs value <(q+1)/2)
- *
- * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and
- * smaller than q in absolute value.
- *
- **************************************************/
-static INLINE int16_t fqmul(int16_t a, int16_t b)
-__contract__(
-  requires(b > -HALF_Q)
-  requires(b < HALF_Q)
-  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
-)
-{
-  int16_t res;
-  debug_assert_abs_bound(&b, 1, HALF_Q);
-
-  res = montgomery_reduce((int32_t)a * (int32_t)b);
-  /* Bounds:
-   * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
-   *        < MLKEM_Q
-   */
-
-  debug_assert_abs_bound(&res, 1, MLKEM_Q);
-  return res;
-}
-
-/*************************************************
- * Name:        barrett_reduce
- *
- * Description: Barrett reduction; given a 16-bit integer a, computes
- *              centered representative congruent to a mod q in
- *              {-(q-1)/2,...,(q-1)/2}
- *
- * Arguments:   - int16_t a: input integer to be reduced
- *
- * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
- **************************************************/
-static INLINE int16_t barrett_reduce(int16_t a)
-__contract__(
-  ensures(return_value > -HALF_Q && return_value < HALF_Q)
-)
-{
-  /*
-   * To divide by MLKEM_Q using Barrett multiplication, the "magic number"
-   * multiplier is round_to_nearest(2**26/MLKEM_Q)
-   */
-  const int BPOWER = 26;
-  const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q;
-
-  /*
-   * Compute round_to_nearest(a/MLKEM_Q) using the multiplier
-   * above and shift by BPOWER places.
-   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
-   * implementation-defined for negative left argument. Here,
-   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
-   */
-  const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER;
-
-  /*
-   * t is in -10 .. +10, so we need 32-bit math to
-   * evaluate t * MLKEM_Q and the subsequent subtraction
-   */
-  int16_t res = (int16_t)(a - t * MLKEM_Q);
-
-  debug_assert_abs_bound(&res, 1, HALF_Q);
-  return res;
-}
-
-#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.c
deleted file mode 100644
index cbbe4407f..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.c
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-
-#include "arith_backend.h"
-#include "debug.h"
-#include "fips202.h"
-#include "fips202x4.h"
-#include "rej_uniform.h"
-#include "symmetric.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
-#define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
-/* End of static namespacing */
-
-static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
-                                       unsigned int offset, const uint8_t *buf,
-                                       unsigned int buflen)
-__contract__(
-  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
-  requires(memory_no_alias(r, sizeof(int16_t) * target))
-  requires(memory_no_alias(buf, buflen))
-  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * target))
-  ensures(offset <= return_value && return_value <= target)
-  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
-)
-{
-  unsigned int ctr, pos;
-  uint16_t val0, val1;
-
-  debug_assert_bound(r, offset, 0, MLKEM_Q);
-
-  ctr = offset;
-  pos = 0;
-  /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
-  while (ctr < target && pos + 3 <= buflen)
-  __loop__(
-    invariant(offset <= ctr && ctr <= target && pos <= buflen)
-    invariant(ctr > 0 ==> array_bound(r, 0, ctr, 0, MLKEM_Q)))
-  {
-    val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
-    val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
-    pos += 3;
-
-    if (val0 < MLKEM_Q)
-    {
-      r[ctr++] = val0;
-    }
-    if (ctr < target && val1 < MLKEM_Q)
-    {
-      r[ctr++] = val1;
-    }
-  }
-
-  debug_assert_bound(r, ctr, 0, MLKEM_Q);
-  return ctr;
-}
-
-#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
-/*************************************************
- * Name:        rej_uniform
- *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
- *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
- *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
- **************************************************/
-
-/*
- * NOTE: The signature differs from the Kyber reference implementation
- * in that it adds the offset and always expects the base of the target
- * buffer. This avoids shifting the buffer base in the caller, which appears
- * tricky to reason about.
- */
-static unsigned int rej_uniform(int16_t *r, unsigned int target,
-                                unsigned int offset, const uint8_t *buf,
-                                unsigned int buflen)
-__contract__(
-  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
-  requires(memory_no_alias(r, sizeof(int16_t) * target))
-  requires(memory_no_alias(buf, buflen))
-  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * target))
-  ensures(offset <= return_value && return_value <= target)
-  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
-)
-{
-  return rej_uniform_scalar(r, target, offset, buf, buflen);
-}
-#else  /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-static unsigned int rej_uniform(int16_t *r, unsigned int target,
-                                unsigned int offset, const uint8_t *buf,
-                                unsigned int buflen)
-{
-  int ret;
-
-  /* Sample from large buffer with full lane as much as possible. */
-  ret = rej_uniform_native(r + offset, target - offset, buf, buflen);
-  if (ret != -1)
-  {
-    unsigned res = offset + (unsigned)ret;
-    debug_assert_bound(r, res, 0, MLKEM_Q);
-    return res;
-  }
-
-  return rej_uniform_scalar(r, target, offset, buf, buflen);
-}
-#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-
-#ifndef MLKEM_GEN_MATRIX_NBLOCKS
-#define MLKEM_GEN_MATRIX_NBLOCKS \
-  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
-#endif
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
-{
-  /* Temporary buffers for XOF output before rejection sampling */
-  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-
-  /* Tracks the number of coefficients we have already sampled */
-  unsigned int ctr[KECCAK_WAY];
-  xof_x4_ctx statex;
-  unsigned int buflen;
-
-  shake128x4_inc_init(&statex);
-
-  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
-  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
-                MLKEM_SYMBYTES + 2);
-
-  /*
-   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   * This should generate the matrix entries with high probability.
-   */
-  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
-                       &statex);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
-  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
-  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
-  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
-
-  /*
-   * So long as not all matrix entries have been generated, squeeze
-   * one more block a time until we're done.
-   */
-  buflen = XOF_RATE;
-  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
-         ctr[3] < MLKEM_N)
-  __loop__(
-    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
-       object_whole(buf1), object_whole(buf2), object_whole(buf3))
-    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
-    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
-    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
-    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
-    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
-    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
-  {
-    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
-    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
-    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
-    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
-    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
-  }
-
-  xof_x4_release(&statex);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
-{
-  xof_ctx state;
-  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  unsigned int ctr, buflen;
-
-  shake128_inc_init(&state);
-
-  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
-
-  /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   */
-  /* This should generate the matrix entry with high probability. */
-  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
-
-  /* Squeeze + sample one more block a time until we're done */
-  buflen = XOF_RATE;
-  while (ctr < MLKEM_N)
-  __loop__(
-    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
-    invariant(ctr <= MLKEM_N)
-    invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q)))
-  {
-    xof_squeezeblocks(buf, 1, &state);
-    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
-  }
-
-  xof_release(&state);
-}
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform)
-int empty_cu_rej_uniform;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.h
deleted file mode 100644
index 801287259..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/rej_uniform.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef REJ_UNIFORM_H
-#define REJ_UNIFORM_H
-
-#include <stdint.h>
-#include <stdlib.h>
-#include "cbmc.h"
-#include "common.h"
-#include "poly.h"
-
-#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
-/*************************************************
- * Name:        poly_rej_uniform_x4
- *
- * Description: Generate four polynomials using rejection sampling
- *              on (pseudo-)uniformly random bytes sampled from a seed.
- *
- * Arguments:   - poly *vec:           Pointer to an array of 4 polynomials
- *                                     to be sampled.
- *              - uint8_t *seed[4]:    Pointer to array of four pointers
- *                                     pointing to the seed buffers of size
- *                                     MLKEM_SYMBYTES + 2 each.
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
-__contract__(
-  requires(memory_no_alias(vec, sizeof(poly) * 4))
-  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
-  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(vec, sizeof(poly) * 4))
-  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
-
-#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform)
-/*************************************************
- * Name:        poly_rej_uniform
- *
- * Description: Generate polynomial using rejection sampling
- *              on (pseudo-)uniformly random bytes sampled from a seed.
- *
- * Arguments:   - poly *vec:           Pointer to polynomial to be sampled.
- *              - uint8_t *seed:       Pointer to seed buffer of size
- *                                     MLKEM_SYMBYTES + 2 each.
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
-__contract__(
-  requires(memory_no_alias(entry, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(entry, sizeof(poly)))
-  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
-
-#endif /* REJ_UNIFORM_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/sampling.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/sampling.c
new file mode 100644
index 000000000..98cbdcb74
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/sampling.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
+#include "arith_backend.h"
+#include "debug.h"
+#include "fips202.h"
+#include "fips202x4.h"
+#include "sampling.h"
+#include "symmetric.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
+#define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
+#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
+#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
+/* End of static namespacing */
+
+static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
+                                       unsigned int offset, const uint8_t *buf,
+                                       unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
+)
+{
+  unsigned int ctr, pos;
+  uint16_t val0, val1;
+
+  debug_assert_bound(r, offset, 0, MLKEM_Q);
+
+  ctr = offset;
+  pos = 0;
+  /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
+  while (ctr < target && pos + 3 <= buflen)
+  __loop__(
+    invariant(offset <= ctr && ctr <= target && pos <= buflen)
+    invariant(ctr > 0 ==> array_bound(r, 0, ctr, 0, MLKEM_Q)))
+  {
+    val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
+    val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
+    pos += 3;
+
+    if (val0 < MLKEM_Q)
+    {
+      r[ctr++] = val0;
+    }
+    if (ctr < target && val1 < MLKEM_Q)
+    {
+      r[ctr++] = val1;
+    }
+  }
+
+  debug_assert_bound(r, ctr, 0, MLKEM_Q);
+  return ctr;
+}
+
+#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
+/*************************************************
+ * Name:        rej_uniform
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int target: requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *                                     Must be <= 4096.
+ *              - unsigned int offset: number of 16-bit integers that have
+ *                                     already been sampled.
+ *                                     Must be <= target.
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes
+ *                                     Must be <= 4096.
+ *                                     Must be a multiple of 3.
+ *
+ * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
+ * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
+ *
+ * Returns the new offset of sampled 16-bit integers, at most target,
+ * and at least the initial offset.
+ * If the new offset is strictly less than target, the entire input buffer
+ * is guaranteed to have been consumed. If it equals target, no information
+ * is provided on how many bytes of the input buffer have been consumed.
+ **************************************************/
+
+/*
+ * NOTE: The signature differs from the Kyber reference implementation
+ * in that it adds the offset and always expects the base of the target
+ * buffer. This avoids shifting the buffer base in the caller, which appears
+ * tricky to reason about.
+ */
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
+)
+{
+  return rej_uniform_scalar(r, target, offset, buf, buflen);
+}
+#else  /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
+{
+  int ret;
+
+  /* Sample from large buffer with full lane as much as possible. */
+  ret = rej_uniform_native(r + offset, target - offset, buf, buflen);
+  if (ret != -1)
+  {
+    unsigned res = offset + (unsigned)ret;
+    debug_assert_bound(r, res, 0, MLKEM_Q);
+    return res;
+  }
+
+  return rej_uniform_scalar(r, target, offset, buf, buflen);
+}
+#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+#ifndef MLKEM_GEN_MATRIX_NBLOCKS
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
+#endif
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+{
+  /* Temporary buffers for XOF output before rejection sampling */
+  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+
+  /* Tracks the number of coefficients we have already sampled */
+  unsigned int ctr[KECCAK_WAY];
+  xof_x4_ctx statex;
+  unsigned int buflen;
+
+  shake128x4_inc_init(&statex);
+
+  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
+  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
+                MLKEM_SYMBYTES + 2);
+
+  /*
+   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   * This should generate the matrix entries with high probability.
+   */
+  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
+                       &statex);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
+  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
+  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
+  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
+
+  /*
+   * So long as not all matrix entries have been generated, squeeze
+   * one more block at a time until we're done.
+   */
+  buflen = XOF_RATE;
+  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
+         ctr[3] < MLKEM_N)
+  __loop__(
+    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
+       object_whole(buf1), object_whole(buf2), object_whole(buf3))
+    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
+    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
+    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
+    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
+    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
+    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
+  {
+    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
+    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
+    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
+    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
+    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
+  }
+
+  xof_x4_release(&statex);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+{
+  xof_ctx state;
+  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  unsigned int ctr, buflen;
+
+  shake128_inc_init(&state);
+
+  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
+
+  /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   */
+  /* This should generate the matrix entry with high probability. */
+  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
+
+  /* Squeeze + sample one more block at a time until we're done */
+  buflen = XOF_RATE;
+  while (ctr < MLKEM_N)
+  __loop__(
+    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
+    invariant(ctr <= MLKEM_N)
+    invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q)))
+  {
+    xof_squeezeblocks(buf, 1, &state);
+    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
+  }
+
+  xof_release(&state);
+}
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
+#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
+/* End of static namespacing */
+
+/*************************************************
+ * Name:        load32_littleendian
+ *
+ * Description: load 4 bytes into a 32-bit integer
+ *              in little-endian order
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x
+ **************************************************/
+static uint32_t load32_littleendian(const uint8_t x[4])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  r |= (uint32_t)x[3] << 24;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
+  {
+    unsigned j;
+    uint32_t t = load32_littleendian(buf + 4 * i);
+    uint32_t d = t & 0x55555555;
+    d += (t >> 1) & 0x55555555;
+
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
+    {
+      const int16_t a = (d >> (4 * j + 0)) & 0x3;
+      const int16_t b = (d >> (4 * j + 2)) & 0x3;
+      r->coeffs[8 * i + j] = a - b;
+    }
+  }
+}
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+/*************************************************
+ * Name:        load24_littleendian
+ *
+ * Description: load 3 bytes into a 32-bit integer
+ *              in little-endian order.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
+ **************************************************/
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 4; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 4)
+    invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
+  {
+    unsigned j;
+    const uint32_t t = load24_littleendian(buf + 3 * i);
+    uint32_t d = t & 0x00249249;
+    d += (t >> 1) & 0x00249249;
+    d += (t >> 2) & 0x00249249;
+
+    for (j = 0; j < 4; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 4 && j <= 4)
+      invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
+    {
+      const int16_t a = (d >> (6 * j + 0)) & 0x7;
+      const int16_t b = (d >> (6 * j + 3)) & 0x7;
+      r->coeffs[4 * i + j] = a - b;
+    }
+  }
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
+          3 */
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_sampling MLKEM_NAMESPACE_K(empty_cu_sampling)
+int empty_cu_sampling;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/sampling.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/sampling.h
new file mode 100644
index 000000000..cc524e0fc
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/sampling.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef SAMPLING_H
+#define SAMPLING_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include "cbmc.h"
+#include "common.h"
+#include "poly.h"
+
+#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
+/*************************************************
+ * Name:        poly_cbd2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
+/*************************************************
+ * Name:        poly_cbd3
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=3.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED || MLKEM_ETA1 == 3 */
+
+#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
+/*************************************************
+ * Name:        poly_rej_uniform_x4
+ *
+ * Description: Generate four polynomials using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
+ *
+ * Arguments:   - poly *vec:           Pointer to an array of 4 polynomials
+ *                                     to be sampled.
+ *              - uint8_t *seed[4]:    Pointer to array of four pointers
+ *                                     pointing to the seed buffers of size
+ *                                     MLKEM_SYMBYTES + 2 each.
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+__contract__(
+  requires(memory_no_alias(vec, sizeof(poly) * 4))
+  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
+  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(vec, sizeof(poly) * 4))
+  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+
+#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform)
+/*************************************************
+ * Name:        poly_rej_uniform
+ *
+ * Description: Generate polynomial using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
+ *
+ * Arguments:   - poly *entry:         Pointer to polynomial to be sampled.
+ *              - uint8_t *seed:       Pointer to seed buffer of size
+ *                                     MLKEM_SYMBYTES + 2.
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+__contract__(
+  requires(memory_no_alias(entry, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(entry, sizeof(poly)))
+  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+
+#endif /* SAMPLING_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/zetas.c
index 4ef887c62..987f0dce4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/zetas.c
@@ -10,7 +10,7 @@
 
 #include "common.h"
 #if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-#include "ntt.h"
+#include "poly.h"
 
 /*
  * Table of zeta values used in the reference NTT and inverse NTT.
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/api.h
deleted file mode 100644
index 792ecb8a4..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/api.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * Native arithmetic interface
- *
- * This header is primarily for documentation purposes.
- * It should not be included by backend implementations.
- *
- * To ensure consistency with backends, the header will be
- * included automatically after inclusion of the active
- * backend, to ensure consistency of function signatures,
- * and run sanity checks.
- */
-#ifdef MLKEM_NATIVE_ARITH_NATIVE_API_H
-#error \
-    "The arithmetic backend API `mlkem/native/api.h` "		\
-    "should not be directly included. Please include the relevant "	\
-    "structure headers directly."
-#else /* MLKEM_NATIVE_ARITH_NATIVE_API_H */
-#define MLKEM_NATIVE_ARITH_NATIVE_API_H
-
-#include <stdint.h>
-#include "poly.h"
-#include "polyvec.h"
-
-/*
- * This is the C<->native interface allowing for the drop-in of
- * native code for performance critical arithmetic components of ML-KEM.
- *
- * A _backend_ is a specific implementation of (part of) this interface.
- *
- * To add a function to a backend, define MLKEM_USE_NATIVE_XXX and
- * implement `static inline xxx(...)` in the profile header.
- *
- * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can
- * be set if there are native implementations for all of NTT, invNTT, and
- * base multiplication, and allows the native implementation to use a
- * custom order of polynomial coefficients in NTT domain -- the use of such
- * custom order is not an implementation-detail since the public matrix
- * is generated in NTT domain. In this case, a permutation function
- * poly_permute_bitrev_to_custom() needs to be provided that permutes
- * polynomials in NTT domain from bitreversed to the custom order.
- */
-
-/*
- * Those functions are meant to be trivial wrappers around the chosen native
- * implementation. The are static inline to avoid unnecessary calls.
- * The macro before each declaration controls whether a native
- * implementation is present.
- */
-
-#if defined(MLKEM_USE_NATIVE_NTT)
-/*************************************************
- * Name:        ntt_native
- *
- * Description: Computes negacyclic number-theoretic transform (NTT) of
- *              a polynomial in place.
- *
- *              The input polynomial is assumed to be in normal order.
- *              The output polynomial is in bitreversed order, or of a
- *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
- *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
- *              for more information.
- *
- * Arguments:   - poly *p: pointer to in/output polynomial
- **************************************************/
-static INLINE void ntt_native(poly *);
-#endif /* MLKEM_USE_NATIVE_NTT */
-
-#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
-/*
- * This must only be set if NTT, invNTT, basemul, mulcache, and
- * to/from byte stream conversions all have native implementations
- * that are adapted to the custom order.
- */
-#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \
-    !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) ||                  \
-    !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) ||  \
-    !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) ||                           \
-    !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
-#error \
-    "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \
-set if there are native implementations for NTT, invNTT, mulcache, basemul, \
-and to/from bytes conversions."
-#endif
-
-/*************************************************
- * Name:        poly_permute_bitrev_to_custom
- *
- * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined,
- *              convert a polynomial in NTT domain from bitreversed
- *              order to the custom order output by the native NTT.
- *
- *              This must only be defined if there is native code for
- *              all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache.
- * Arguments:   - poly *p: pointer to in/output polynomial
- *
- **************************************************/
-static INLINE void poly_permute_bitrev_to_custom(poly *);
-#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
-
-#if defined(MLKEM_USE_NATIVE_INTT)
-/*************************************************
- * Name:        intt_native
- *
- * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
- *              of a polynomial in place.
- *
- *              The input polynomial is in bitreversed order, or of a
- *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
- *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
- *              for more information.
- *              The output polynomial is assumed to be in normal order.
- *
- * Arguments:   - uint16_t *a: pointer to in/output polynomial
- **************************************************/
-static INLINE void intt_native(poly *);
-#endif /* MLKEM_USE_NATIVE_INTT */
-
-#if defined(MLKEM_USE_NATIVE_POLY_REDUCE)
-/*************************************************
- * Name:        poly_reduce_native
- *
- * Description: Applies modular reduction to all coefficients of a polynomial.
- *
- * Arguments:   - poly *r: pointer to input/output polynomial
- **************************************************/
-static INLINE void poly_reduce_native(poly *);
-#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
-
-#if defined(MLKEM_USE_NATIVE_POLY_TOMONT)
-/*************************************************
- * Name:        poly_tomont_native
- *
- * Description: Inplace conversion of all coefficients of a polynomial
- *              from normal domain to Montgomery domain
- *
- * Arguments:   - poly *r: pointer to input/output polynomial
- **************************************************/
-static INLINE void poly_tomont_native(poly *);
-#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
-
-#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE)
-/*************************************************
- * Name:        poly_mulcache_compute_native
- *
- * Description: Compute multiplication cache for a polynomial
- *              in NTT domain.
- *
- *              The purpose of the multiplication cache is to
- *              cache repeated computations required during a
- *              base multiplication of polynomials in NTT domain.
- *              The structure of the multiplication-cache is
- *              implementation defined.
- *
- * Arguments:   INPUT:
- *              - poly: const pointer to input polynomial.
- *                  This must be in NTT domain and inin bitreversed order, or of
- *                  a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
- *                  See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
- *                  for more information.
- *              OUTPUT
- *              - cache: pointer to multiplication cache
- **************************************************/
-static INLINE void poly_mulcache_compute_native(poly_mulcache *cache,
-                                                const poly *poly);
-#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
-
-#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
-/*************************************************
- * Name:        poly_mulcache_compute_native
- *
- * Description: Compute multiplication of polynomials in NTT domain.
- *
- * Arguments:   INPUT:
- *              - a: First polynomial operand.
- *                 This must be in NTT domain and inin bitreversed order, or of
- *                 a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
- *                 See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
- *                 for more information.
- *              - b: Second polynomial operand.
- *                 As for a.
- *              - b_cache: Multiplication-cache for b.
- *              OUTPUT
- *              - r: Result of the base multiplication. This is again
- *                   in NTT domain, and of the same order as a and b.
- **************************************************/
-static INLINE void polyvec_basemul_acc_montgomery_cached_native(
-    poly *r, const polyvec *a, const polyvec *b,
-    const polyvec_mulcache *b_cache);
-#endif
-
-#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
-/*************************************************
- * Name:        poly_tobytes_native
- *
- * Description: Serialization of a polynomial.
- *              Signed coefficients are converted to
- *              unsigned form before serialization.
- *
- * Arguments:   INPUT:
- *              - a: const pointer to input polynomial,
- *                with each coefficient in the range -Q+1 .. Q-1
- *              OUTPUT
- *              - r: pointer to output byte array
- *                   (of MLKEM_POLYBYTES bytes)
- **************************************************/
-static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
-                                       const poly *a);
-#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
-
-#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
-/*************************************************
- * Name:        poly_frombytes_native
- *
- * Description: Serialization of a polynomial.
- *              Signed coefficients are converted to
- *              unsigned form before serialization.
- *
- * Arguments:   INPUT:
- *              - r: pointer to output polynomial in NTT domain
- *              OUTPUT
- *              - a: const pointer to input byte aray
- *                   (of MLKEM_POLYBYTES bytes)
- **************************************************/
-static INLINE void poly_frombytes_native(poly *a,
-                                         const uint8_t r[MLKEM_POLYBYTES]);
-#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-
-#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
-/*************************************************
- * Name:        rej_uniform_native
- *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
- *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int len:    requested number of 16-bit integers
- *                                     (uniform mod q).
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes.
- *
- * Return -1 if the native implementation does not support the input lengths.
- * Otherwise, returns non-negative number of sampled 16-bit integers (at most
- * len).
- **************************************************/
-static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
-                                     const uint8_t *buf, unsigned int buflen);
-#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-
-#endif /* MLKEM_NATIVE_ARITH_NATIVE_API_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/arith_backend.h
index 0543b1bd1..ade31cda1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/arith_backend.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/arith_backend.h
@@ -17,7 +17,7 @@
  * Keep this _after_ the inclusion of the backend; otherwise,
  * the sanity checks won't have an effect. */
 #if defined(MLKEM_NATIVE_CHECK_APIS)
-#include "api.h"
+#include "native/api.h"
 #endif
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbd.c
deleted file mode 100644
index 1e6b7c5d1..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbd.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
-
-#include <stdint.h>
-#include "cbd.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
-#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
-/* End of static namespacing */
-
-/*************************************************
- * Name:        load32_littleendian
- *
- * Description: load 4 bytes into a 32-bit integer
- *              in little-endian order
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x
- **************************************************/
-static uint32_t load32_littleendian(const uint8_t x[4])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  r |= (uint32_t)x[3] << 24;
-  return r;
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
-  {
-    unsigned j;
-    uint32_t t = load32_littleendian(buf + 4 * i);
-    uint32_t d = t & 0x55555555;
-    d += (t >> 1) & 0x55555555;
-
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
-    {
-      const int16_t a = (d >> (4 * j + 0)) & 0x3;
-      const int16_t b = (d >> (4 * j + 2)) & 0x3;
-      r->coeffs[8 * i + j] = a - b;
-    }
-  }
-}
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
-/*************************************************
- * Name:        load24_littleendian
- *
- * Description: load 3 bytes into a 32-bit integer
- *              in little-endian order.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
- **************************************************/
-static uint32_t load24_littleendian(const uint8_t x[3])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  return r;
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 4; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 4)
-    invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
-  {
-    unsigned j;
-    const uint32_t t = load24_littleendian(buf + 3 * i);
-    uint32_t d = t & 0x00249249;
-    d += (t >> 1) & 0x00249249;
-    d += (t >> 2) & 0x00249249;
-
-    for (j = 0; j < 4; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 4 && j <= 4)
-      invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
-    {
-      const int16_t a = (d >> (6 * j + 0)) & 0x7;
-      const int16_t b = (d >> (6 * j + 3)) & 0x7;
-      r->coeffs[4 * i + j] = a - b;
-    }
-  }
-}
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
-          3 */
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd)
-int empty_cu_cbd;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbd.h
deleted file mode 100644
index 54c1f5b90..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/cbd.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef CBD_H
-#define CBD_H
-
-#include <stdint.h>
-#include "common.h"
-#include "poly.h"
-
-#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
-/*************************************************
- * Name:        poly_cbd2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
-#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
-/*************************************************
- * Name:        poly_cbd3
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=3.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */
-
-#endif /* CBD_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/common.h
index 4f326333e..62ed53ab1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/common.h
@@ -15,12 +15,19 @@
 #include "sys.h"
 
 /* Include backend metadata */
-#if defined(MLKEM_USE_NATIVE)
-#if defined(MLKEM_NATIVE_ARITH_BACKEND)
-#include MLKEM_NATIVE_ARITH_BACKEND
+#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH)
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_FILE)
+#include MLKEM_NATIVE_ARITH_BACKEND_FILE
+#else
+#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_ARITH is set, but MLKEM_NATIVE_ARITH_BACKEND_FILE is not.
+#endif
 #endif
-#if defined(MLKEM_NATIVE_FIPS202_BACKEND)
-#include MLKEM_NATIVE_FIPS202_BACKEND
+
+#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202)
+#if defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE)
+#include MLKEM_NATIVE_FIPS202_BACKEND_FILE
+#else
+#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, but MLKEM_NATIVE_FIPS202_BACKEND_FILE is not.
 #endif
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/compress.c
new file mode 100644
index 000000000..a03fe0ac4
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/compress.c
@@ -0,0 +1,395 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
+#include <stdint.h>
+#include <string.h>
+#include "arith_backend.h"
+#include "cbmc.h"
+#include "compress.h"
+#include "debug.h"
+#include "verify.h"
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 16)))
+    {
+      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
+    }
+
+    r[i * 4] = t[0] | (t[1] << 4);
+    r[i * 4 + 1] = t[2] | (t[3] << 4);
+    r[i * 4 + 2] = t[4] | (t[5] << 4);
+    r[i * 4 + 3] = t[6] | (t[7] << 4);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(invariant(j <= MLKEM_N / 4))
+  {
+    unsigned k;
+    uint16_t t[4];
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k <= 4)
+      invariant(forall(r, 0, k, t[r] < (1u << 10))))
+    {
+      t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 10-bit in size.
+     */
+    r[5 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
+    r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
+    r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
+    r[5 * j + 4] = (t[3] >> 2);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
+  {
+    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
+    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 4)
+    invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[4];
+    uint8_t const *base = &a[5 * j];
+
+    t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6));
+    t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4));
+    t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2));
+
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k <= 4)
+      invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 32)))
+    {
+      t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
+    }
+
+    /*
+     * Explicitly truncate to avoid warning about
+     * implicit truncation in CBMC, and use array indexing into
+     * r rather than pointer-arithmetic to simplify verification
+     */
+    r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
+    r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
+    r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
+    r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
+    r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(invariant(j <= MLKEM_N / 8))
+  {
+    unsigned k;
+    uint16_t t[8];
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+    {
+      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 11-bit in size.
+     */
+    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
+    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
+    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
+    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
+    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
+    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
+    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
+    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
+    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
+    r[11 * j + 10] = (t[7] >> 3);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
+  {
+    unsigned j;
+    uint8_t t[8];
+    const unsigned offset = i * 5;
+    /*
+     * Explicitly truncate to avoid warning about
+     * implicit truncation in CBMC and unwind loop for ease
+     * of proof.
+     */
+
+    /*
+     * Decompress 5 8-bit bytes (so 40 bits) into
+     * 8 5-bit values stored in t[]
+     */
+    t[0] = 0x1F & (a[offset + 0] >> 0);
+    t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3));
+    t[2] = 0x1F & (a[offset + 1] >> 2);
+    t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1));
+    t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4));
+    t[5] = 0x1F & (a[offset + 3] >> 1);
+    t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2));
+    t[7] = 0x1F & (a[offset + 4] >> 3);
+
+    /* and copy to the correct slice in r[] */
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(j <= 8 && i <= MLKEM_N / 8)
+      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[8];
+    uint8_t const *base = &a[11 * j];
+    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
+    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
+                    ((uint16_t)base[4] << 10));
+    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
+    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
+    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
+                    ((uint16_t)base[8] << 9));
+    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
+    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
+
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(invariant(i <= MLKEM_N / 2))
+  {
+    const uint16_t t0 = a->coeffs[2 * i];
+    const uint16_t t1 = a->coeffs[2 * i + 1];
+    /*
+     * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
+     * significant data, so these can be packed into 24 bits or exactly
+     * 3 bytes, as follows.
+     */
+
+    /* Least significant bits 0 - 7 of t0. */
+    r[3 * i + 0] = t0 & 0xFF;
+
+    /*
+     * Most significant bits 8 - 11 of t0 become the least significant
+     * nibble of the second byte. The least significant 4 bits
+     * of t1 become the upper nibble of the second byte.
+     */
+    r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+
+    /* Bits 4 - 11 of t1 become the third byte. */
+    r[3 * i + 2] = t1 >> 4;
+  }
+}
+#else  /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  poly_tobytes_native(r, a->coeffs);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
+  {
+    const uint8_t t0 = a[3 * i + 0];
+    const uint8_t t1 = a[3 * i + 1];
+    const uint8_t t2 = a[3 * i + 2];
+    r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
+    r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
+  }
+
+  /* Note that the coefficients are not canonical */
+  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
+}
+#else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+  poly_frombytes_native(r->coeffs, a);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+{
+  unsigned i; /* index of the current message byte */
+#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
+#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
+#endif
+  /* Coefficient 8*i+j is HALF_Q or 0, selected on bit j of msg[i]. */
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
+  {
+    unsigned j; /* bit position within msg[i] */
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i < MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
+    {
+      /* Prevent the compiler from recognizing this as a bit selection */
+      uint8_t mask = value_barrier_u8(1u << j);
+      r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
+    }
+  }
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); /* matches loop invariant */
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    msg[i] = 0;
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8))
+    {
+      uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
+      msg[i] |= t << j;
+    }
+  }
+}
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_compress MLKEM_NAMESPACE_K(empty_cu_compress)
+int empty_cu_compress;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/compress.h
new file mode 100644
index 000000000..409dbe519
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/compress.h
@@ -0,0 +1,495 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef COMPRESS_H
+#define COMPRESS_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "cbmc.h"
+#include "common.h"
+#include "debug.h"
+#include "poly.h"
+#include "verify.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1)
+#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4)
+#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5)
+#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10)
+#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11)
+#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4)
+#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5)
+#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10)
+#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11)
+/* End of static namespacing */
+
+/************************************************************
+ * Name: scalar_compress_d1
+ *
+ * Description: Computes round(u * 2 / q) % 2
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 1.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d1(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 2)
+  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
+{
+  uint32_t d0 = u << 1;
+  d0 *= 645083;
+  d0 += 1u << 30;
+  d0 >>= 31;
+  return d0;
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_compress_d4
+ *
+ * Description: Computes round(u * 16 / q) % 16
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d4(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 16)
+  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
+{
+  uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */
+  return (d0 + (1u << 27)) >> 28;      /* round(d0/2^28) */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d4
+ *
+ * Description: Computes round(u * q / 16)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 16
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d4(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 16)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 8) / 16; }
+
+/************************************************************
+ * Name: scalar_compress_d5
+ *
+ * Description: Computes round(u * 32 / q) % 32
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d5(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 32)
+  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
+{
+  uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */
+  return (d0 + (1u << 26)) >> 27;      /* round(d0/2^27) */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d5
+ *
+ * Description: Computes round(u * q / 32)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 32
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d5(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 32)
+  ensures(return_value <= MLKEM_Q - 1)
+) { return ((u * MLKEM_Q) + 16) / 32; }
+
+/************************************************************
+ * Name: scalar_compress_d10
+ *
+ * Description: Computes round(u * 2**10 / q) % 2**10
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine exceeds UINT32_MAX for most
+ * values of u and is therefore carried out in 64-bit arithmetic.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d10(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 10))
+  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
+{
+  uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^33 / MLKEM_Q) */
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
+  return (d0 & 0x3FF);
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d10
+ *
+ * Description: Computes round(u * q / 1024)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 1024
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d10(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 1024)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 512) / 1024; }
+
+/************************************************************
+ * Name: scalar_compress_d11
+ *
+ * Description: Computes round(u * 2**11 / q) % 2**11
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine exceeds UINT32_MAX for most
+ * values of u and is therefore carried out in 64-bit arithmetic.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d11(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 11))
+  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
+{
+  uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
+  return (d0 & 0x7FF);
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d11
+ *
+ * Description: Computes round(u * q / 2048)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 2048
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d11(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 2048)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 1024) / 2048; }
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
+    (MLKEM_K == 2 || MLKEM_K == 3)
+#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
+/*************************************************
+ * Name:        poly_compress_d4
+ *
+ * Description: Compression (4 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
+
+#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
+/*************************************************
+ * Name:        poly_compress_d10
+ *
+ * Description: Compression (10 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
+
+#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
+/*************************************************
+ * Name:        poly_decompress_d4
+ *
+ * Description: De-serialization and subsequent decompression (4 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d4
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
+
+#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
+/*************************************************
+ * Name:        poly_decompress_d10
+ *
+ * Description: De-serialization and subsequent decompression (10 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d10
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
+/*************************************************
+ * Name:        poly_compress_d5
+ *
+ * Description: Compression (5 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
+
+#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
+/*************************************************
+ * Name:        poly_compress_d11
+ *
+ * Description: Compression (11 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
+
+#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
+/*************************************************
+ * Name:        poly_decompress_d5
+ *
+ * Description: De-serialization and subsequent decompression (5 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d5
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+
+#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
+/*************************************************
+ * Name:        poly_decompress_d11
+ *
+ * Description: De-serialization and subsequent decompression (11 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d11
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
+        */
+
+#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
+/*************************************************
+ * Name:        poly_tobytes
+ *
+ * Description: Serialization of a polynomial.
+ *              Coefficients must already be in unsigned
+ *              canonical form (see the input range below).
+ *
+ * Arguments:   INPUT:
+ *              - a: const pointer to input polynomial,
+ *                with each coefficient in the range [0,1,..,Q-1]
+ *              OUTPUT
+ *              - r: pointer to output byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYBYTES))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r))
+);
+
+
+#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes)
+/*************************************************
+ * Name:        poly_frombytes
+ *
+ * Description: De-serialization of a polynomial.
+ *
+ * Arguments:   INPUT
+ *              - a: pointer to input byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ *              OUTPUT
+ *              - r: pointer to output polynomial, with
+ *                   each coefficient unsigned and in the range
+ *                   0 .. 4095
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))
+);
+
+
+#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg)
+/*************************************************
+ * Name:        poly_frommsg
+ *
+ * Description: Convert 32-byte message to polynomial
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *msg: pointer to input message
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+);
+
+#define poly_tomsg MLKEM_NAMESPACE(poly_tomsg)
+/*************************************************
+ * Name:        poly_tomsg
+ *
+ * Description: Convert polynomial to 32-byte message
+ *
+ * Arguments:   - uint8_t *msg: pointer to output message
+ *              - const poly *r: pointer to input polynomial
+ *                Coefficients must be unsigned canonical
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r)
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(msg))
+);
+
+#endif /* COMPRESS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/config.h
index fa89370ce..e975ede95 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/config.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/config.h
@@ -122,46 +122,87 @@
 /* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 /******************************************************************************
- * Name:        MLKEM_USE_NATIVE
+ * Name:        MLKEM_USE_NATIVE_BACKEND_ARITH
  *
- * Description: Determines whether a native backend should
- *              be used, if available.
+ * Description: Determines whether a native arithmetic backend should be used.
+ *
+ *              The arithmetic backend covers performance critical functions
+ *              such as the number-theoretic transform (NTT).
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the arithmetic backend to be used is
+ *              determined by MLKEM_NATIVE_ARITH_BACKEND_FILE: If the latter is
+ *              unset, the default backend for the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
  *
  *              This can also be set using CFLAGS.
  *
  *****************************************************************************/
-#if !defined(MLKEM_USE_NATIVE)
-/* #define MLKEM_USE_NATIVE */
+#if !defined(MLKEM_USE_NATIVE_BACKEND_ARITH)
+/* #define MLKEM_USE_NATIVE_BACKEND_ARITH */
 #endif
 
 /******************************************************************************
- * Name:        MLKEM_NATIVE_ARITH_BACKEND
+ * Name:        MLKEM_NATIVE_ARITH_BACKEND_FILE
  *
  * Description: The arithmetic backend to use.
  *
- *              This must be the filename of an arithmetic backend.
- *              See the existing backends for examples.
+ *              If MLKEM_USE_NATIVE_BACKEND_ARITH is unset, this option
+ *              is ignored.
+ *
+ *              If MLKEM_USE_NATIVE_BACKEND_ARITH is set, this option must
+ *              either be undefined or the filename of an arithmetic backend.
+ *              If unset, the default backend will be used.
  *
  *              This can be set using CFLAGS.
  *
  *****************************************************************************/
-#if defined(MLKEM_USE_NATIVE) && !defined(MLKEM_NATIVE_ARITH_BACKEND)
-#define MLKEM_NATIVE_ARITH_BACKEND "default.h"
-#endif /* MLKEM_NATIVE_ARITH_BACKEND */
+#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) && \
+    !defined(MLKEM_NATIVE_ARITH_BACKEND_FILE)
+#define MLKEM_NATIVE_ARITH_BACKEND_FILE "native/default.h"
+#endif
 
 /******************************************************************************
- * Name:        MLKEM_NATIVE_FIPS202_BACKEND
+ * Name:        MLKEM_USE_NATIVE_BACKEND_FIPS202
+ *
+ * Description: Determines whether a native FIPS202 backend should be used.
+ *
+ *              The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is
+ *              the performance bottleneck of SHA3 and SHAKE.
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the FIPS202 backend to be used is
+ *              determined by MLKEM_NATIVE_FIPS202_BACKEND_FILE: If the latter
+ *              is unset, the default backend for the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if !defined(MLKEM_USE_NATIVE_BACKEND_FIPS202)
+/* #define MLKEM_USE_NATIVE_BACKEND_FIPS202 */
+#endif
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_FIPS202_BACKEND_FILE
  *
  * Description: The FIPS-202 backend to use.
  *
- *              This must be the filename of an FIPS-202 backend.
+ *              If MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, this option must
+ *              either be undefined or the filename of a FIPS202 backend.
+ *              If unset, the default backend will be used.
  *
  *              This can be set using CFLAGS.
  *
  *****************************************************************************/
-#if defined(MLKEM_USE_NATIVE_FIPS202) && !defined(MLKEM_NATIVE_FIPS202_BACKEND)
-#define MLKEM_NATIVE_FIPS202_BACKEND "native/default.h"
-#endif /* MLKEM_NATIVE_FIPS202_BACKEND */
+#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) && \
+    !defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE)
+#define MLKEM_NATIVE_FIPS202_BACKEND_FILE "fips202/native/default.h"
+#endif
 
 /*************************  Config internals  ********************************/
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/default.h
deleted file mode 100644
index d1e41c52e..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/default.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H
-#define MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H
-
-/*
- * Default arithmetic backend
- */
-#include "sys.h"
-
-#ifdef SYS_AARCH64
-/*
- * For AArch64, we currently we have one clean and one opt profile.
- * We default to the opt profile.
- *
- * In the future, this may branch further depending on the microarchitecture.
- */
-#include "aarch64/opt.h"
-#endif /* SYS_AARCH64 */
-
-#ifdef SYS_X86_64_AVX2
-/*
- * For now, there's only one x86_64 profile, based on
- * the AVX2 code from the Kyber repository.
- * https://github.com/pq-crystals/kyber
- */
-#include "x86_64/default.h"
-#endif /* SYS_X86_64 */
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.c
index 0cfcc3e9e..318d0fc77 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.c
@@ -9,11 +9,10 @@
 #include "fips202.h"
 #include "fips202x4.h"
 #include "indcpa.h"
-#include "ntt.h"
 #include "poly.h"
-#include "polyvec.h"
+#include "poly_k.h"
 #include "randombytes.h"
-#include "rej_uniform.h"
+#include "sampling.h"
 #include "symmetric.h"
 
 #include "arith_backend.h"
@@ -149,14 +148,14 @@ static void unpack_ciphertext(polyvec *b, poly *v,
 #define poly_permute_bitrev_to_custom \
   MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom)
 
-static INLINE void poly_permute_bitrev_to_custom(poly *data)
+static INLINE void poly_permute_bitrev_to_custom(int16_t data[MLKEM_N])
 __contract__(
   /* We don't specify that this should be a permutation, but only
    * that it does not change the bound established at the end of gen_matrix. */
-  requires(memory_no_alias(data, sizeof(poly)))
-  requires(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N))
+  requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))
   assigns(memory_slice(data, sizeof(poly)))
-  ensures(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
+  ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
 #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
 
 /* Not static for benchmarking */
@@ -245,7 +244,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
   {
     for (j = 0; j < MLKEM_K; j++)
     {
-      poly_permute_bitrev_to_custom(&a[i].vec[j]);
+      poly_permute_bitrev_to_custom(a[i].vec[j].coeffs);
     }
   }
 }
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.h
index 2c4fda3c4..b4d5985bf 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/indcpa.h
@@ -8,7 +8,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "polyvec.h"
+#include "poly_k.h"
 
 #define gen_matrix MLKEM_NAMESPACE_K(gen_matrix)
 /*************************************************
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/api.h
new file mode 100644
index 000000000..0704f9dcd
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/api.h
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * Native arithmetic interface
+ *
+ * This header is primarily for documentation purposes.
+ * It should not be included by backend implementations.
+ *
+ * To ensure consistency with backends, the header will be
+ * included automatically after inclusion of the active
+ * backend, to ensure consistency of function signatures,
+ * and run sanity checks.
+ */
+#ifdef MLKEM_NATIVE_ARITH_NATIVE_API_H
+#error \
+    "The arithmetic backend API `mlkem/native/api.h` "		\
+    "should not be directly included. Please include the relevant "	\
+    "structure headers directly."
+#else /* MLKEM_NATIVE_ARITH_NATIVE_API_H */
+#define MLKEM_NATIVE_ARITH_NATIVE_API_H
+
+#include <stdint.h>
+#include "../common.h"
+
+/*
+ * This is the C<->native interface allowing for the drop-in of
+ * native code for performance critical arithmetic components of ML-KEM.
+ *
+ * A _backend_ is a specific implementation of (part of) this interface.
+ *
+ * To add a function to a backend, define MLKEM_USE_NATIVE_XXX and
+ * implement `static inline xxx(...)` in the profile header.
+ *
+ * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can
+ * be set if there are native implementations for all of NTT, invNTT, and
+ * base multiplication, and allows the native implementation to use a
+ * custom order of polynomial coefficients in NTT domain -- the use of such
+ * custom order is not an implementation-detail since the public matrix
+ * is generated in NTT domain. In this case, a permutation function
+ * poly_permute_bitrev_to_custom() needs to be provided that permutes
+ * polynomials in NTT domain from bitreversed to the custom order.
+ */
+
+/*
+ * Those functions are meant to be trivial wrappers around the chosen native
+ * implementation. They are static inline to avoid unnecessary calls.
+ * The macro before each declaration controls whether a native
+ * implementation is present.
+ */
+
+#if defined(MLKEM_USE_NATIVE_NTT)
+/*************************************************
+ * Name:        ntt_native
+ *
+ * Description: Computes negacyclic number-theoretic transform (NTT) of
+ *              a polynomial in place.
+ *
+ *              The input polynomial is assumed to be in normal order.
+ *              The output polynomial is in bitreversed order, or of a
+ *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *              for more information.
+ *
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
+ **************************************************/
+static INLINE void ntt_native(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_NTT */
+
+#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
+/*
+ * This must only be set if NTT, invNTT, basemul, mulcache, and
+ * to/from byte stream conversions all have native implementations
+ * that are adapted to the custom order.
+ */
+#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \
+    !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) ||                  \
+    !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) ||  \
+    !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) ||                           \
+    !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+#error \
+    "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \
+set if there are native implementations for NTT, invNTT, mulcache, basemul, \
+and to/from bytes conversions."
+#endif
+
+/*************************************************
+ * Name:        poly_permute_bitrev_to_custom
+ *
+ * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined,
+ *              convert a polynomial in NTT domain from bitreversed
+ *              order to the custom order output by the native NTT.
+ *
+ *              This must only be defined if there is native code for all of
+ *              NTT, invNTT, basemul, mulcache, and to/from bytes conversions.
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
+ *
+ **************************************************/
+static INLINE void poly_permute_bitrev_to_custom(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
+
+#if defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        intt_native
+ *
+ * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+ *              of a polynomial in place.
+ *
+ *              The input polynomial is in bitreversed order, or of a
+ *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *              for more information.
+ *              The output polynomial is assumed to be in normal order.
+ *
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
+ **************************************************/
+static INLINE void intt_native(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_INTT */
+
+#if defined(MLKEM_USE_NATIVE_POLY_REDUCE)
+/*************************************************
+ * Name:        poly_reduce_native
+ *
+ * Description: Applies modular reduction to all coefficients of a polynomial.
+ *
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to input/output polynomial
+ **************************************************/
+static INLINE void poly_reduce_native(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
+
+#if defined(MLKEM_USE_NATIVE_POLY_TOMONT)
+/*************************************************
+ * Name:        poly_tomont_native
+ *
+ * Description: Inplace conversion of all coefficients of a polynomial
+ *              from normal domain to Montgomery domain
+ *
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to input/output polynomial
+ **************************************************/
+static INLINE void poly_tomont_native(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
+
+#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE)
+/*************************************************
+ * Name:        poly_mulcache_compute_native
+ *
+ * Description: Compute multiplication cache for a polynomial
+ *              in NTT domain.
+ *
+ *              The purpose of the multiplication cache is to
+ *              cache repeated computations required during a
+ *              base multiplication of polynomials in NTT domain.
+ *              The structure of the multiplication-cache is
+ *              implementation defined.
+ *
+ * Arguments:   INPUT:
+ *              - poly: const pointer to input polynomial.
+ *                  This must be in NTT domain and in bitreversed order, or of
+ *                  a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *                  See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *                  for more information.
+ *              OUTPUT
+ *              - cache: pointer to multiplication cache
+ **************************************************/
+static INLINE void poly_mulcache_compute_native(int16_t cache[MLKEM_N / 2],
+                                                const int16_t poly[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
+#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery_cached_native
+ *
+ * Description: Compute multiplication of polynomials in NTT domain.
+ *
+ * Arguments:   INPUT:
+ *              - a: First polynomial operand.
+ *                 This must be in NTT domain and in bitreversed order, or of
+ *                 a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *                 See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *                 for more information.
+ *              - b: Second polynomial operand.
+ *                 As for a.
+ *              - b_cache: Multiplication-cache for b.
+ *              OUTPUT
+ *              - r: Result of the base multiplication. This is again
+ *                   in NTT domain, and of the same order as a and b.
+ **************************************************/
+static INLINE void polyvec_basemul_acc_montgomery_cached_native(
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]);
+#endif
+
+#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
+/*************************************************
+ * Name:        poly_tobytes_native
+ *
+ * Description: Serialization of a polynomial.
+ *              Signed coefficients are converted to
+ *              unsigned form before serialization.
+ *
+ * Arguments:   INPUT:
+ *              - a: const pointer to input polynomial,
+ *                with each coefficient in the range -Q+1 .. Q-1
+ *              OUTPUT
+ *              - r: pointer to output byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ **************************************************/
+static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
+                                       const int16_t a[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+
+#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+/*************************************************
+ * Name:        poly_frombytes_native
+ *
+ * Description: Deserialization of a polynomial.
+ *              Converts a byte array of MLKEM_POLYBYTES bytes
+ *              into the coefficients of a polynomial.
+ *
+ * Arguments:   INPUT:
+ *              - r: const pointer to input byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ *              OUTPUT
+ *              - a: pointer to output polynomial in NTT domain
+ **************************************************/
+static INLINE void poly_frombytes_native(int16_t a[MLKEM_N],
+                                         const uint8_t r[MLKEM_POLYBYTES]);
+#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+
+#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
+/*************************************************
+ * Name:        rej_uniform_native
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int len:    requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes.
+ *
+ * Return -1 if the native implementation does not support the input lengths.
+ * Otherwise, returns non-negative number of sampled 16-bit integers (at most
+ * len).
+ **************************************************/
+static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
+                                     const uint8_t *buf, unsigned int buflen);
+#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+#endif /* MLKEM_NATIVE_ARITH_NATIVE_API_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/default.h
new file mode 100644
index 000000000..f9fe4310a
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/default.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H
+#define MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H
+
+/*
+ * Default arithmetic backend
+ */
+#include "../sys.h"
+
+#ifdef SYS_AARCH64
+/*
+ * For AArch64, we currently have one clean and one opt profile.
+ * We default to the opt profile.
+ *
+ * In the future, this may branch further depending on the microarchitecture.
+ */
+#include "aarch64/opt.h"
+#endif /* SYS_AARCH64 */
+
+#ifdef SYS_X86_64_AVX2
+/*
+ * For now, there's only one x86_64 profile, based on
+ * the AVX2 code from the Kyber repository.
+ * https://github.com/pq-crystals/kyber
+ */
+#include "x86_64/default.h"
+#endif /* SYS_X86_64_AVX2 */
+
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/README.md b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/README.md
similarity index 100%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/README.md
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/README.md
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/default.h
similarity index 90%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/default.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/default.h
index 592e8996d..73f53dc13 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/default.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/default.h
@@ -19,6 +19,6 @@
 /* Filename of the C backend implementation.
  * This is not inlined here because this header is included in assembly
  * files as well. */
-#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "x86_64/src/default_impl.h"
+#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/x86_64/src/default_impl.h"
 
 #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/align.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/align.h
similarity index 100%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/align.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/align.h
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/arith_native_x86_64.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/arith_native_x86_64.h
similarity index 91%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/arith_native_x86_64.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/arith_native_x86_64.h
index 25e00a930..acde977ad 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/arith_native_x86_64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/arith_native_x86_64.h
@@ -5,11 +5,10 @@
 #ifndef MLKEM_X86_64_NATIVE_H
 #define MLKEM_X86_64_NATIVE_H
 
-#include "common.h"
+#include "../../../common.h"
 
 #include <immintrin.h>
 #include <stdint.h>
-#include "polyvec.h"
 #include "consts.h"
 
 #define REJ_UNIFORM_AVX_NBLOCKS 3 /* See MLKEM_GEN_MATRIX_NBLOCKS */
@@ -44,8 +43,9 @@ void basemul_avx2(__m256i *r, const __m256i *a, const __m256i *b,
 #define polyvec_basemul_acc_montgomery_cached_avx2 \
   MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_avx2)
 void polyvec_basemul_acc_montgomery_cached_avx2(
-    poly *r, const polyvec *a, const polyvec *b,
-    const polyvec_mulcache *b_cache);
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]);
 
 #define ntttobytes_avx2 MLKEM_NAMESPACE(ntttobytes_avx2)
 void ntttobytes_avx2(uint8_t *r, const __m256i *a, const __m256i *qdata);
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.S
index b97840e70..5fdc3d0a0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.S
@@ -6,7 +6,7 @@
 // Implementation from Kyber reference repository
 // https://github.com/pq-crystals/kyber/blob/main/avx2
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
 #include "consts.h"
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.c
similarity index 51%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.c
index 5f9ae99c8..970938306 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.c
@@ -3,46 +3,46 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
-#include "poly.h"
-#include "polyvec.h"
-
 #include "arith_native_x86_64.h"
 #include "consts.h"
 
-static void poly_basemul_montgomery_avx2(poly *r, const poly *a, const poly *b)
+static void poly_basemul_montgomery_avx2(int16_t r[MLKEM_N],
+                                         const int16_t a[MLKEM_N],
+                                         const int16_t b[MLKEM_N])
 {
-  basemul_avx2((__m256i *)r->coeffs, (const __m256i *)a->coeffs,
-               (const __m256i *)b->coeffs, qdata.vec);
+  basemul_avx2((__m256i *)r, (const __m256i *)a, (const __m256i *)b, qdata.vec);
 }
 
 /*
  * Implementation from Kyber reference repository
  * https://github.com/pq-crystals/kyber/blob/main/avx2
  */
-static void poly_add_avx2(poly *r, const poly *a, const poly *b)
+static void poly_add_avx2(int16_t r[MLKEM_N], const int16_t a[MLKEM_N],
+                          const int16_t b[MLKEM_N])
 {
   unsigned i;
   __m256i f0, f1;
 
   for (i = 0; i < MLKEM_N; i += 16)
   {
-    f0 = _mm256_load_si256((const __m256i *)&a->coeffs[i]);
-    f1 = _mm256_load_si256((const __m256i *)&b->coeffs[i]);
+    f0 = _mm256_load_si256((const __m256i *)&a[i]);
+    f1 = _mm256_load_si256((const __m256i *)&b[i]);
     f0 = _mm256_add_epi16(f0, f1);
-    _mm256_store_si256((__m256i *)&r->coeffs[i], f0);
+    _mm256_store_si256((__m256i *)&r[i], f0);
   }
 }
 
-void polyvec_basemul_acc_montgomery_cached_avx2(poly *r, const polyvec *a,
-                                                const polyvec *b,
-                                                const polyvec_mulcache *b_cache)
+void polyvec_basemul_acc_montgomery_cached_avx2(
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)])
 {
   unsigned i;
-  poly t;
+  int16_t t[MLKEM_N] ALIGN;
 
   /* TODO: Use mulcache for AVX2. So far, it is unused. */
   ((void)b_cache);
@@ -50,11 +50,11 @@ void polyvec_basemul_acc_montgomery_cached_avx2(poly *r, const polyvec *a,
   /* Coefficient-wise bound of each basemul is 2q.
    * Since we are accumulating at most 4 times, the
    * overall bound is 8q < INT16_MAX. */
-  poly_basemul_montgomery_avx2(r, &a->vec[0], &b->vec[0]);
+  poly_basemul_montgomery_avx2(r, &a[0], &b[0]);
   for (i = 1; i < MLKEM_K; i++)
   {
-    poly_basemul_montgomery_avx2(&t, &a->vec[i], &b->vec[i]);
-    poly_add_avx2(r, r, &t);
+    poly_basemul_montgomery_avx2(t, &a[i * MLKEM_N], &b[i * MLKEM_N]);
+    poly_add_avx2(r, r, t);
   }
 }
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/consts.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/consts.c
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/consts.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/consts.c
index 86a0835ef..568752ae8 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/consts.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/consts.c
@@ -8,7 +8,7 @@
  * https://github.com/pq-crystals/kyber/blob/main/avx2/consts.c
  */
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/consts.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/consts.h
similarity index 97%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/consts.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/consts.h
index 00c415952..e2846b609 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/consts.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/consts.h
@@ -11,7 +11,7 @@
 #ifndef CONSTS_H
 #define CONSTS_H
 
-#include "common.h"
+#include "../../../common.h"
 
 #define AVX2_BACKEND_DATA_OFFSET_16XQ 0
 #define AVX2_BACKEND_DATA_OFFSET_16XQINV 16
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/default_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/default_impl.h
similarity index 62%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/default_impl.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/default_impl.h
index 029111c17..3683361e2 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/default_impl.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/default_impl.h
@@ -12,8 +12,7 @@
 
 #include <string.h>
 
-#include "poly.h"
-#include "polyvec.h"
+#include "../../../params.h"
 #include "arith_native_x86_64.h"
 
 #define MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
@@ -28,9 +27,9 @@
 #define MLKEM_USE_NATIVE_POLY_TOBYTES
 #define MLKEM_USE_NATIVE_POLY_FROMBYTES
 
-static INLINE void poly_permute_bitrev_to_custom(poly *data)
+static INLINE void poly_permute_bitrev_to_custom(int16_t data[MLKEM_N])
 {
-  nttunpack_avx2((__m256i *)(data->coeffs), qdata.vec);
+  nttunpack_avx2((__m256i *)(data), qdata.vec);
 }
 
 static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
@@ -45,27 +44,28 @@ static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
   return (int)rej_uniform_avx2(r, buf);
 }
 
-static INLINE void ntt_native(poly *data)
+static INLINE void ntt_native(int16_t data[MLKEM_N])
 {
   ntt_avx2((__m256i *)data, qdata.vec);
 }
 
-static INLINE void intt_native(poly *data)
+static INLINE void intt_native(int16_t data[MLKEM_N])
 {
   invntt_avx2((__m256i *)data, qdata.vec);
 }
 
-static INLINE void poly_reduce_native(poly *data)
+static INLINE void poly_reduce_native(int16_t data[MLKEM_N])
 {
-  reduce_avx2((__m256i *)data->coeffs, qdata.vec);
+  reduce_avx2((__m256i *)data, qdata.vec);
 }
 
-static INLINE void poly_tomont_native(poly *data)
+static INLINE void poly_tomont_native(int16_t data[MLKEM_N])
 {
-  tomont_avx2((__m256i *)data->coeffs, qdata.vec);
+  tomont_avx2((__m256i *)data, qdata.vec);
 }
 
-static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const poly *y)
+static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2],
+                                                const int16_t y[MLKEM_N])
 {
   /* AVX2 backend does not use mulcache */
   ((void)y);
@@ -73,22 +73,23 @@ static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const poly *y)
 }
 
 static INLINE void polyvec_basemul_acc_montgomery_cached_native(
-    poly *r, const polyvec *a, const polyvec *b,
-    const polyvec_mulcache *b_cache)
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)])
 {
   polyvec_basemul_acc_montgomery_cached_avx2(r, a, b, b_cache);
 }
 
 static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
-                                       const poly *a)
+                                       const int16_t a[MLKEM_N])
 {
-  ntttobytes_avx2(r, (const __m256i *)a->coeffs, qdata.vec);
+  ntttobytes_avx2(r, (const __m256i *)a, qdata.vec);
 }
 
-static INLINE void poly_frombytes_native(poly *r,
+static INLINE void poly_frombytes_native(int16_t r[MLKEM_N],
                                          const uint8_t a[MLKEM_POLYBYTES])
 {
-  nttfrombytes_avx2((__m256i *)r->coeffs, a, qdata.vec);
+  nttfrombytes_avx2((__m256i *)r, a, qdata.vec);
 }
 
 #endif /* MLKEM_NATIVE_ARITH_PROFILE_IMPL_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/fq.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/fq.S
similarity index 98%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/fq.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/fq.S
index 134bd4f71..3f013a5fa 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/fq.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/fq.S
@@ -11,7 +11,7 @@
 //   in [0,1,...,q-1] rather than [0,1,...,q], matching the
 //   semantics of poly_reduce().
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 #include "consts.h"
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/fq.inc b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/fq.inc
similarity index 100%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/fq.inc
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/fq.inc
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/intt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/intt.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/intt.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/intt.S
index 6b1d78ef2..7b1f22624 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/intt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/intt.S
@@ -9,7 +9,7 @@
  * Changes to placement of modular reductions have
  * been made to simplify reasoning of non-overflow */
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/ntt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntt.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/ntt.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntt.S
index e8bf7894b..5d928b4cc 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/ntt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntt.S
@@ -6,7 +6,7 @@
 // Implementation from Kyber reference repository
 // https://github.com/pq-crystals/kyber/blob/main/avx2
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
 #include "consts.h"
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/rej_uniform_avx2.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_avx2.c
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/rej_uniform_avx2.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_avx2.c
index 54037a0df..adf2d338b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/rej_uniform_avx2.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_avx2.c
@@ -8,7 +8,7 @@
  * https://github.com/pq-crystals/kyber/blob/main/avx2
  */
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/rej_uniform_table.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_table.c
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/rej_uniform_table.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_table.c
index 9bbc47146..e95fd9e79 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/rej_uniform_table.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_table.c
@@ -8,7 +8,7 @@
  *          Do not modify it directly.
  */
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/shuffle.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/shuffle.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/shuffle.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/shuffle.S
index 5e708748a..9bcd04896 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/shuffle.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/shuffle.S
@@ -6,7 +6,7 @@
 // Implementation from Kyber reference repository
 // https://github.com/pq-crystals/kyber/blob/main/avx2
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/shuffle.inc b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/shuffle.inc
similarity index 100%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/shuffle.inc
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/shuffle.inc
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/x86_64_zetas.i b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/x86_64_zetas.i
similarity index 100%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/x86_64_zetas.i
rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/x86_64_zetas.i
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/ntt.c
deleted file mode 100644
index 3651c8da9..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/ntt.c
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-
-#include <stdint.h>
-#include "arith_backend.h"
-#include "debug.h"
-#include "ntt.h"
-#include "reduce.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block)
-#define ntt_layer MLKEM_NAMESPACE(ntt_layer)
-#define invntt_layer MLKEM_NAMESPACE(invntt_layer)
-/* End of static namespacing */
-
-#if !defined(MLKEM_USE_NATIVE_NTT)
-/*
- * Computes a block CT butterflies with a fixed twiddle factor,
- * using Montgomery multiplication.
- * Parameters:
- * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
- * - root: Twiddle factor to use for the butterfly. This must be in
- *         Montgomery form and signed canonical.
- * - start: Offset to the beginning of the butterfly block
- * - len: Index difference between coefficients subject to a butterfly
- * - bound: Ghost variable describing coefficient bound: Prior to `start`,
- *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
- *          they must be bound by `bound`.
- * When this function returns, output coefficients in the index range
- * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
- * Example:
- * - start=8, len=4
- *   This would compute the following four butterflies
- *          8     --    12
- *             9    --     13
- *                10   --     14
- *                   11   --     15
- * - start=4, len=2
- *   This would compute the following two butterflies
- *          4 -- 6
- *             5 -- 7
- */
-static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
-                                unsigned start, unsigned len, int bound)
-__contract__(
-  requires(start < MLKEM_N)
-  requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
-  requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
-  requires(-HALF_Q < zeta && zeta < HALF_Q)
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(array_abs_bound(r, 0, start, bound + MLKEM_Q))
-  requires(array_abs_bound(r, start, MLKEM_N, bound))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q))
-  ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
-{
-  /* `bound` is a ghost variable only needed in the CBMC specification */
-  unsigned j;
-  ((void)bound);
-  for (j = start; j < start + len; j++)
-  __loop__(
-    invariant(start <= j && j <= start + len)
-    /*
-     * Coefficients are updated in strided pairs, so the bounds for the
-     * intermediate states alternate twice between the old and new bound
-     */
-    invariant(array_abs_bound(r, 0,           j,           bound + MLKEM_Q))
-    invariant(array_abs_bound(r, j,           start + len, bound))
-    invariant(array_abs_bound(r, start + len, j + len,     bound + MLKEM_Q))
-    invariant(array_abs_bound(r, j + len,     MLKEM_N,     bound)))
-  {
-    int16_t t;
-    t = fqmul(r[j + len], zeta);
-    r[j + len] = r[j] - t;
-    r[j] = r[j] + t;
-  }
-}
-
-/*
- *Compute one layer of forward NTT
- * Parameters:
- * - r: Pointer to base of polynomial
- * - len: Stride of butterflies in this layer.
- * - layer: Ghost variable indicating which layer is being applied.
- *          Must match `len` via `len == MLKEM_N >> layer`.
- * Note: `len` could be dropped and computed in the function, but
- *   we are following the structure of the reference NTT from the
- *   official Kyber implementation here, merely adding `layer` as
- *   a ghost variable for the specifications.
- */
-static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
-__contract__(
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
-  requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
-{
-  unsigned start, k;
-  /* `layer` is a ghost variable only needed in the CBMC specification */
-  ((void)layer);
-  /* Twiddle factors for layer n start at index 2^(layer-1) */
-  k = MLKEM_N / (2 * len);
-  for (start = 0; start < MLKEM_N; start += 2 * len)
-  __loop__(
-    invariant(start < MLKEM_N + 2 * len)
-    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
-    invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
-    invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
-  {
-    int16_t zeta = zetas[k++];
-    ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
-  }
-}
-
-/*
- * Compute full forward NTT
- * NOTE: This particular implementation satisfies a much tighter
- * bound on the output coefficients (5*q) than the contractual one (8*q),
- * but this is not needed in the calling code. Should we change the
- * base multiplication strategy to require smaller NTT output bounds,
- * the proof may need strengthening.
- */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *p)
-{
-  unsigned len, layer;
-  int16_t *r;
-  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
-  r = p->coeffs;
-
-  for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
-  __loop__(
-    invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer))
-    invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)))
-  {
-    ntt_layer(r, len, layer);
-  }
-
-  /* Check the stronger bound */
-  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
-}
-#else  /* MLKEM_USE_NATIVE_NTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *p)
-{
-  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
-  ntt_native(p);
-  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
-}
-#endif /* MLKEM_USE_NATIVE_NTT */
-
-#if !defined(MLKEM_USE_NATIVE_INTT)
-
-/* Compute one layer of inverse NTT */
-static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
-__contract__(
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
-  requires(len == (1 << (8 - layer)))
-  requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-{
-  unsigned start, k;
-  /* `layer` is a ghost variable used only in the specification */
-  ((void)layer);
-  k = MLKEM_N / len - 1;
-  for (start = 0; start < MLKEM_N; start += 2 * len)
-  __loop__(
-    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-    invariant(start <= MLKEM_N && k <= 127)
-    /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
-    invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
-  {
-    unsigned j;
-    int16_t zeta = zetas[k--];
-    for (j = start; j < start + len; j++)
-    __loop__(
-      invariant(start <= j && j <= start + len)
-      invariant(start <= MLKEM_N && k <= 127)
-      invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-    {
-      int16_t t = r[j];
-      r[j] = barrett_reduce(t + r[j + len]);
-      r[j + len] = r[j + len] - t;
-      r[j + len] = fqmul(r[j + len], zeta);
-    }
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *p)
-{
-  /*
-   * Scale input polynomial to account for Montgomery factor
-   * and NTT twist. This also brings coefficients down to
-   * absolute value < MLKEM_Q.
-   */
-  unsigned j, len, layer;
-  const int16_t f = 1441;
-  int16_t *r = p->coeffs;
-
-  for (j = 0; j < MLKEM_N; j++)
-  __loop__(
-    invariant(j <= MLKEM_N)
-    invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
-  {
-    r[j] = fqmul(r[j], f);
-  }
-
-  /* Run the invNTT layers */
-  for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
-  __loop__(
-    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
-    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-  {
-    invntt_layer(p->coeffs, len, layer);
-  }
-
-  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
-}
-#else  /* MLKEM_USE_NATIVE_INTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *p)
-{
-  intt_native(p);
-  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
-}
-#endif /* MLKEM_USE_NATIVE_INTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
-                    int16_t b_cached)
-{
-  int32_t t0, t1;
-  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
-
-  t0 = (int32_t)a[1] * b_cached;
-  t0 += (int32_t)a[0] * b[0];
-  t1 = (int32_t)a[0] * b[1];
-  t1 += (int32_t)a[1] * b[0];
-
-  /* |ti| < 2 * q * 2^15 */
-  r[0] = montgomery_reduce(t0);
-  r[1] = montgomery_reduce(t1);
-
-  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
-}
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt)
-int empty_cu_ntt;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/ntt.h
deleted file mode 100644
index 4e80d3ab3..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/ntt.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef NTT_H
-#define NTT_H
-#include "common.h"
-
-#include <stdint.h>
-#include "cbmc.h"
-#include "poly.h"
-#include "reduce.h"
-
-#define zetas MLKEM_NAMESPACE(zetas)
-extern const int16_t zetas[128];
-
-#define poly_ntt MLKEM_NAMESPACE(poly_ntt)
-/*************************************************
- * Name:        poly_ntt
- *
- * Description: Computes negacyclic number-theoretic transform (NTT) of
- *              a polynomial in place.
- *
- *              The input is assumed to be in normal order and
- *              coefficient-wise bound by MLKEM_Q in absolute value.
- *
- *              The output polynomial is in bitreversed order, and
- *              coefficient-wise bound by NTT_BOUND in absolute value.
- *
- *              (NOTE: Sometimes the input to the NTT is actually smaller,
- *               which gives better bounds.)
- *
- * Arguments:   - poly *p: pointer to in/output polynomial
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND))
-);
-
-#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont)
-/*************************************************
- * Name:        poly_invntt_tomont
- *
- * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
- *              of a polynomial in place;
- *              inputs assumed to be in bitreversed order, output in normal
- *              order
- *
- *              The input is assumed to be in bitreversed order, and can
- *              have arbitrary coefficients in int16_t.
- *
- *              The output polynomial is in normal order, and
- *              coefficient-wise bound by INVNTT_BOUND in absolute value.
- *
- * Arguments:   - uint16_t *a: pointer to in/output polynomial
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND))
-);
-
-#define basemul_cached MLKEM_NAMESPACE(basemul_cached)
-/************************************************************
- * Name: basemul_cached
- *
- * Description: Computes a representative modulo q of
- *              (a0*b0 + a1*b_cached, a0*b1 + a1*b0)/65536
- *
- *              If b_cached is b1*zeta, this represents the
- *              product of (a0 + a1*X) and (b0 + b1*X) in
- *              Fq[X]/(X^2 - zeta).
- *
- * Arguments: - r: Pointer to output polynomial
- *                   Upon return, coefficients are bound by
- *                   2*MLKEM_Q in absolute value.
- *            - a: Pointer to first input polynomial
- *                   Every coefficient must be in [0..4095]
- *            - b: Pointer to second input polynomial
- *                   Can have arbitrary int16_t coefficients
- *            - b_cached: Some precomputed value, typically derived from
- *                   b1 and a twiddle factor. Can be an arbitary int16_t.
- ************************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
-                    int16_t b_cached)
-__contract__(
-  requires(memory_no_alias(r, 2 * sizeof(int16_t)))
-  requires(memory_no_alias(a, 2 * sizeof(int16_t)))
-  requires(memory_no_alias(b, 2 * sizeof(int16_t)))
-  requires(array_bound(a, 0, 2, 0, UINT12_LIMIT))
-  assigns(memory_slice(r, 2 * sizeof(int16_t)))
-  ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))
-);
-
-#endif /* NTT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/params.h
index 57ea4c8ba..7f6c12625 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/params.h
@@ -18,6 +18,7 @@
 #define MLKEM_N 256
 #define MLKEM_Q 3329
 #define UINT12_LIMIT 4096
+#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */
 
 #define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */
 #define MLKEM_SSBYTES 32  /* size in bytes of shared key */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.c
index 7483ebf6d..e8a2e2c6e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.c
@@ -8,388 +8,246 @@
 #include <stdint.h>
 #include <string.h>
 #include "arith_backend.h"
-#include "cbd.h"
 #include "cbmc.h"
 #include "debug.h"
 #include "fips202x4.h"
-#include "ntt.h"
 #include "poly.h"
-#include "reduce.h"
+#include "sampling.h"
 #include "symmetric.h"
 #include "verify.h"
 
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
-{
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 16)))
-    {
-      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
-    }
-
-    r[i * 4] = t[0] | (t[1] << 4);
-    r[i * 4 + 1] = t[2] | (t[3] << 4);
-    r[i * 4 + 2] = t[4] | (t[5] << 4);
-    r[i * 4 + 3] = t[6] | (t[7] << 4);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
-{
-  unsigned j;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-  for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(invariant(j <= MLKEM_N / 4))
-  {
-    unsigned k;
-    uint16_t t[4];
-    for (k = 0; k < 4; k++)
-    __loop__(
-      invariant(k <= 4)
-      invariant(forall(r, 0, k, t[r] < (1u << 10))))
-    {
-      t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
-    }
-
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 10-bit in size.
-     */
-    r[5 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
-    r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
-    r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
-    r[5 * j + 4] = (t[3] >> 2);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
-  {
-    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
-    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d10(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16)
+#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic)
+#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce)
+#define fqmul MLKEM_NAMESPACE(fqmul)
+#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce)
+#define basemul_cached MLKEM_NAMESPACE(basemul_cached)
+#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q)
+#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block)
+#define ntt_layer MLKEM_NAMESPACE(ntt_layer)
+#define invntt_layer MLKEM_NAMESPACE(invntt_layer)
+/* End of static namespacing */
+
+/*************************************************
+ * Name:        cast_uint16_to_int16
+ *
+ * Description: Cast uint16 value to int16
+ *
+ * Returns:
+ *   input x in     0 .. 32767: returns value unchanged
+ *   input x in 32768 .. 65535: returns (x - 65536)
+ **************************************************/
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "conversion"
+#endif
+ALWAYS_INLINE
+static INLINE int16_t cast_uint16_to_int16(uint16_t x)
 {
-  unsigned j;
-  for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(
-    invariant(j <= MLKEM_N / 4)
-    invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
-  {
-    unsigned k;
-    uint16_t t[4];
-    uint8_t const *base = &a[5 * j];
-
-    t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6));
-    t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4));
-    t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2));
-
-    for (k = 0; k < 4; k++)
-    __loop__(
-      invariant(k <= 4)
-      invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
-    }
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  /*
+   * PORTABILITY: This relies on uint16_t -> int16_t
+   * being implemented as the inverse of int16_t -> uint16_t,
+   * which is implementation-defined (C99 6.3.1.3 (3))
+   * CBMC (correctly) fails to prove this conversion is OK,
+   * so we have to suppress that check here
+   */
+  return (int16_t)x;
 }
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
-          || MLKEM_K == 3) */
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
 
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
+/*************************************************
+ * Name:        montgomery_reduce_generic
+ *
+ * Description: Generic Montgomery reduction; given a 32-bit integer a, computes
+ *              16-bit integer congruent to a * R^-1 mod q, where R=2^16
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q, with absolute value
+ *                <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2
+ *
+ **************************************************/
+ALWAYS_INLINE
+static INLINE int16_t montgomery_reduce_generic(int32_t a)
 {
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */
+  const uint32_t QINV = 62209; /* q^-1 mod 2^16 */
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 32)))
-    {
-      t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
-    }
+  /*  Compute a*q^{-1} mod 2^16 in unsigned representatives */
+  const uint16_t a_reduced = a & UINT16_MAX;
+  const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
 
-    /*
-     * Explicitly truncate to avoid warning about
-     * implicit truncation in CBMC, and use array indexing into
-     * r rather than pointer-arithmetic to simplify verification
-     */
-    r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
-    r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
-    r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
-    r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
-    r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
-  }
-}
+  /* Lift to signed canonical representative mod 2^16. */
+  const int16_t t = cast_uint16_to_int16(a_inverted);
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
-{
-  unsigned j;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  int32_t r = a - ((int32_t)t * MLKEM_Q);
+  /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */
 
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(invariant(j <= MLKEM_N / 8))
-  {
-    unsigned k;
-    uint16_t t[8];
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(k <= 8)
-      invariant(forall(r, 0, k, t[r] < (1u << 11))))
-    {
-      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
-    }
+  /*
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  r = r >> 16;
+  /* Bounds: |r >> 16| <= ceil(|r| / 2^16)
+   *                   <= ceil(|a| / 2^16 + MLKEM_Q / 2)
+   *                   <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
+   *
+   * (Note that |a >> 16| = ceil(|a| / 2^16) for negative a)
+   */
 
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 11-bit in size.
-     */
-    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
-    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
-    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
-    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
-    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
-    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
-    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
-    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
-    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
-    r[11 * j + 10] = (t[7] >> 3);
-  }
+  return (int16_t)r;
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+/*************************************************
+ * Name:        montgomery_reduce
+ *
+ * Description: Montgomery reduction
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *                  Must be smaller than 2 * 2^12 * 2^15 in absolute value.
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q,
+ *              smaller than 2 * q in absolute value.
+ **************************************************/
+static INLINE int16_t montgomery_reduce(int32_t a)
+__contract__(
+  requires(a > -(2 * UINT12_LIMIT * 32768))
+  requires(a <  (2 * UINT12_LIMIT * 32768))
+  ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
+)
 {
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
-  {
-    unsigned j;
-    uint8_t t[8];
-    const unsigned offset = i * 5;
-    /*
-     * Explicitly truncate to avoid warning about
-     * implicit truncation in CBMC and unwind loop for ease
-     * of proof.
-     */
-
-    /*
-     * Decompress 5 8-bit bytes (so 40 bits) into
-     * 8 5-bit values stored in t[]
-     */
-    t[0] = 0x1F & (a[offset + 0] >> 0);
-    t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3));
-    t[2] = 0x1F & (a[offset + 1] >> 2);
-    t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1));
-    t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4));
-    t[5] = 0x1F & (a[offset + 3] >> 1);
-    t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2));
-    t[7] = 0x1F & (a[offset + 4] >> 3);
-
-    /* and copy to the correct slice in r[] */
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(j <= 8 && i <= MLKEM_N / 8)
-      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
-    }
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  int16_t res;
+  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
+
+  res = montgomery_reduce_generic(a);
+  /* Bounds:
+   * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2
+   *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
+   *        < 2 * MLKEM_Q */
+
+  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
+  return res;
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d11(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+#if !defined(MLKEM_USE_NATIVE_POLY_TOMONT) ||           \
+    !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
+    !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        fqmul
+ *
+ * Description: Montgomery multiplication modulo q=3329
+ *
+ * Arguments:   - int16_t a: first factor
+ *                  Can be any int16_t.
+ *              - int16_t b: second factor.
+ *                  Must be signed canonical (abs value <(q+1)/2)
+ *
+ * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and
+ * smaller than q in absolute value.
+ *
+ **************************************************/
+static INLINE int16_t fqmul(int16_t a, int16_t b)
+__contract__(
+  requires(b > -HALF_Q)
+  requires(b < HALF_Q)
+  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
+)
 {
-  unsigned j;
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(
-    invariant(j <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
-  {
-    unsigned k;
-    uint16_t t[8];
-    uint8_t const *base = &a[11 * j];
-    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
-    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
-                    ((uint16_t)base[4] << 10));
-    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
-    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
-    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
-                    ((uint16_t)base[8] << 9));
-    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
-    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
-
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(k <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
-    }
-  }
+  int16_t res;
+  debug_assert_abs_bound(&b, 1, HALF_Q);
+
+  res = montgomery_reduce((int32_t)a * (int32_t)b);
+  /* Bounds:
+   * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
+   *        < MLKEM_Q
+   */
 
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  debug_assert_abs_bound(&res, 1, MLKEM_Q);
+  return res;
 }
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */
-
-#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+#endif /* !defined(MLKEM_USE_NATIVE_POLY_TOMONT) ||           \
+          !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
+          !defined(MLKEM_USE_NATIVE_NTT) ||                   \
+          !defined(MLKEM_USE_NATIVE_INTT) */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || !defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        barrett_reduce
+ *
+ * Description: Barrett reduction; given a 16-bit integer a, computes
+ *              centered representative congruent to a mod q in
+ *              {-(q-1)/2,...,(q-1)/2}
+ *
+ * Arguments:   - int16_t a: input integer to be reduced
+ *
+ * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
+ **************************************************/
+static INLINE int16_t barrett_reduce(int16_t a)
+__contract__(
+  ensures(return_value > -HALF_Q && return_value < HALF_Q)
+)
 {
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(invariant(i <= MLKEM_N / 2))
-  {
-    const uint16_t t0 = a->coeffs[2 * i];
-    const uint16_t t1 = a->coeffs[2 * i + 1];
-    /*
-     * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
-     * significant data, so these can be packed into 24 bits or exactly
-     * 3 bytes, as follows.
-     */
-
-    /* Least significant bits 0 - 7 of t0. */
-    r[3 * i + 0] = t0 & 0xFF;
-
-    /*
-     * Most significant bits 8 - 11 of t0 become the least significant
-     * nibble of the second byte. The least significant 4 bits
-     * of t1 become the upper nibble of the second byte.
-     */
-    r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+  /*
+   * To divide by MLKEM_Q using Barrett multiplication, the "magic number"
+   * multiplier is round_to_nearest(2**26/MLKEM_Q)
+   */
+  const int BPOWER = 26;
+  const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q;
 
-    /* Bits 4 - 11 of t1 become the third byte. */
-    r[3 * i + 2] = t1 >> 4;
-  }
-}
-#else  /* MLKEM_USE_NATIVE_POLY_TOBYTES */
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
-{
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-  poly_tobytes_native(r, a);
-}
-#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+  /*
+   * Compute round_to_nearest(a/MLKEM_Q) using the multiplier
+   * above and shift by BPOWER places.
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER;
 
-#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
-  {
-    const uint8_t t0 = a[3 * i + 0];
-    const uint8_t t1 = a[3 * i + 1];
-    const uint8_t t2 = a[3 * i + 2];
-    r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
-    r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
-  }
+  /*
+   * t is in -10 .. +10, so we need 32-bit math to
+   * evaluate t * MLKEM_Q and the subsequent subtraction
+   */
+  int16_t res = (int16_t)(a - t * MLKEM_Q);
 
-  /* Note that the coefficients are not canonical */
-  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
-}
-#else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-{
-  poly_frombytes_native(r, a);
+  debug_assert_abs_bound(&res, 1, HALF_Q);
+  return res;
 }
-#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+#endif /* !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || \
+          !defined(MLKEM_USE_NATIVE_INTT) */
+
+static void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
+                           int16_t b_cached)
+__contract__(
+  requires(memory_no_alias(r, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(a, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(b, 2 * sizeof(int16_t)))
+  requires(array_bound(a, 0, 2, 0, UINT12_LIMIT))
+  assigns(memory_slice(r, 2 * sizeof(int16_t)))
+  ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q)))
 {
-  unsigned i;
-#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
-#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
-#endif
+  int32_t t0, t1;
+  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
-  {
-    unsigned j;
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <  MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
-    {
-      /* Prevent the compiler from recognizing this as a bit selection */
-      uint8_t mask = value_barrier_u8(1u << j);
-      r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
-    }
-  }
-  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
-}
+  t0 = (int32_t)a[1] * b_cached;
+  t0 += (int32_t)a[0] * b[0];
+  t1 = (int32_t)a[0] * b[1];
+  t1 += (int32_t)a[1] * b[0];
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
-{
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  /* |ti| < 2 * UINT12_LIMIT * 2^15, since a[i] < UINT12_LIMIT */
+  r[0] = montgomery_reduce(t0);
+  r[1] = montgomery_reduce(t1);
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    msg[i] = 0;
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8))
-    {
-      uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
-      msg[i] |= t << j;
-    }
-  }
+  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -434,12 +292,46 @@ void poly_tomont(poly *r)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomont(poly *r)
 {
-  poly_tomont_native(r);
+  poly_tomont_native(r->coeffs);
   debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
 
 #if !defined(MLKEM_USE_NATIVE_POLY_REDUCE)
+/************************************************************
+ * Name: scalar_signed_to_unsigned_q
+ *
+ * Description: converts signed polynomial coefficient
+ *              from signed (-3328 .. 3328) form to
+ *              unsigned form (0 .. 3328).
+ *
+ * Note: Cryptographic constant time implementation
+ *
+ * Examples:       0 -> 0
+ *                 1 -> 1
+ *              3328 -> 3328
+ *                -1 -> 3328
+ *                -2 -> 3327
+ *             -3328 -> 1
+ *
+ * Arguments: c: signed coefficient to be converted
+ ************************************************************/
+static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
+__contract__(
+  requires(c > -MLKEM_Q && c < MLKEM_Q)
+  ensures(return_value >= 0 && return_value < MLKEM_Q)
+  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
+{
+  debug_assert_abs_bound(&c, 1, MLKEM_Q);
+
+  /* Add Q if c is negative, but in constant time */
+  c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
+
+  /* and therefore cast to uint16_t is safe. */
+  debug_assert_bound(&c, 1, 0, MLKEM_Q);
+  return (uint16_t)c;
+}
+
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
@@ -461,7 +353,7 @@ void poly_reduce(poly *r)
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
-  poly_reduce_native(r);
+  poly_reduce_native(r->coeffs);
   debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
@@ -520,13 +412,232 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 MLKEM_NATIVE_INTERNAL_API
 void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
-  poly_mulcache_compute_native(x, a);
+  poly_mulcache_compute_native(x->coeffs, a->coeffs);
   /* Omitting bounds assertion since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
 }
 #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 
+#if !defined(MLKEM_USE_NATIVE_NTT)
+/*
+ * Computes a block of CT butterflies with a fixed twiddle factor,
+ * using Montgomery multiplication.
+ * Parameters:
+ * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
+ * - zeta: Twiddle factor to use for the butterfly. This must be in
+ *         Montgomery form and signed canonical.
+ * - start: Offset to the beginning of the butterfly block
+ * - len: Index difference between coefficients subject to a butterfly
+ * - bound: Ghost variable describing coefficient bound: Prior to `start`,
+ *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
+ *          they must be bound by `bound`.
+ * When this function returns, output coefficients in the index range
+ * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
+ * Example:
+ * - start=8, len=4
+ *   This would compute the following four butterflies
+ *          8     --    12
+ *             9    --     13
+ *                10   --     14
+ *                   11   --     15
+ * - start=4, len=2
+ *   This would compute the following two butterflies
+ *          4 -- 6
+ *             5 -- 7
+ */
+static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
+                                unsigned start, unsigned len, int bound)
+__contract__(
+  requires(start < MLKEM_N)
+  requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
+  requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
+  requires(-HALF_Q < zeta && zeta < HALF_Q)
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(array_abs_bound(r, 0, start, bound + MLKEM_Q))
+  requires(array_abs_bound(r, start, MLKEM_N, bound))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q))
+  ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
+{
+  /* `bound` is a ghost variable only needed in the CBMC specification */
+  unsigned j;
+  ((void)bound);
+  for (j = start; j < start + len; j++)
+  __loop__(
+    invariant(start <= j && j <= start + len)
+    /*
+     * Coefficients are updated in strided pairs, so the bounds for the
+     * intermediate states alternate twice between the old and new bound
+     */
+    invariant(array_abs_bound(r, 0,           j,           bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j,           start + len, bound))
+    invariant(array_abs_bound(r, start + len, j + len,     bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j + len,     MLKEM_N,     bound)))
+  {
+    int16_t t;
+    t = fqmul(r[j + len], zeta);
+    r[j + len] = r[j] - t;
+    r[j] = r[j] + t;
+  }
+}
+
+/*
+ * Compute one layer of forward NTT
+ * Parameters:
+ * - r: Pointer to base of polynomial
+ * - len: Stride of butterflies in this layer.
+ * - layer: Ghost variable indicating which layer is being applied.
+ *          Must match `len` via `len == MLKEM_N >> layer`.
+ * Note: `len` could be dropped and computed in the function, but
+ *   we are following the structure of the reference NTT from the
+ *   official Kyber implementation here, merely adding `layer` as
+ *   a ghost variable for the specifications.
+ */
+static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
+  requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
+{
+  unsigned start, k;
+  /* `layer` is a ghost variable only needed in the CBMC specification */
+  ((void)layer);
+  /* Twiddle factors for this layer start at index 2^(layer-1) */
+  k = MLKEM_N / (2 * len);
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(start < MLKEM_N + 2 * len)
+    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
+    invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
+    invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
+  {
+    int16_t zeta = zetas[k++];
+    ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
+  }
+}
+
+/*
+ * Compute full forward NTT
+ * NOTE: This particular implementation satisfies a much tighter
+ * bound on the output coefficients (5*q) than the contractual one (8*q),
+ * but this is not needed in the calling code. Should we change the
+ * base multiplication strategy to require smaller NTT output bounds,
+ * the proof may need strengthening.
+ */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *p)
+{
+  unsigned len, layer;
+  int16_t *r;
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+  r = p->coeffs;
+
+  for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
+  __loop__(
+    invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer))
+    invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)))
+  {
+    ntt_layer(r, len, layer);
+  }
+
+  /* Check the stronger bound */
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
+}
+#else  /* MLKEM_USE_NATIVE_NTT */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *p)
+{
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+  ntt_native(p->coeffs);
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
+}
+#endif /* MLKEM_USE_NATIVE_NTT */
+
+#if !defined(MLKEM_USE_NATIVE_INTT)
+
+/* Compute one layer of inverse NTT */
+static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
+  requires(len == (1 << (8 - layer)))
+  requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+{
+  unsigned start, k;
+  /* `layer` is a ghost variable used only in the specification */
+  ((void)layer);
+  k = MLKEM_N / len - 1;
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+    invariant(start <= MLKEM_N && k <= 127)
+    /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
+    invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
+  {
+    unsigned j;
+    int16_t zeta = zetas[k--];
+    for (j = start; j < start + len; j++)
+    __loop__(
+      invariant(start <= j && j <= start + len)
+      invariant(start <= MLKEM_N && k <= 127)
+      invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+    {
+      int16_t t = r[j];
+      r[j] = barrett_reduce(t + r[j + len]);
+      r[j + len] = r[j + len] - t;
+      r[j + len] = fqmul(r[j + len], zeta);
+    }
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *p)
+{
+  /*
+   * Scale input polynomial to account for Montgomery factor
+   * and NTT twist. This also brings coefficients down to
+   * absolute value < MLKEM_Q.
+   */
+  unsigned j, len, layer;
+  const int16_t f = 1441;
+  int16_t *r = p->coeffs;
+
+  for (j = 0; j < MLKEM_N; j++)
+  __loop__(
+    invariant(j <= MLKEM_N)
+    invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
+  {
+    r[j] = fqmul(r[j], f);
+  }
+
+  /* Run the invNTT layers */
+  for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
+  __loop__(
+    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
+    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+  {
+    invntt_layer(p->coeffs, len, layer);
+  }
+
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
+}
+#else  /* MLKEM_USE_NATIVE_INTT */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *p)
+{
+  intt_native(p->coeffs);
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
+}
+#endif /* MLKEM_USE_NATIVE_INTT */
+
 #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 #define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.h
index 6a14c785d..cb0d67c1a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly.h
@@ -9,7 +9,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "reduce.h"
+#include "debug.h"
 #include "verify.h"
 
 /* Absolute exclusive upper bound for the output of the inverse NTT */
@@ -18,6 +18,9 @@
 /* Absolute exclusive upper bound for the output of the forward NTT */
 #define NTT_BOUND (8 * MLKEM_Q)
 
+#define zetas MLKEM_NAMESPACE(zetas)
+extern const int16_t zetas[128];
+
 /*
  * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
  * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
@@ -38,520 +41,6 @@ typedef struct
   int16_t coeffs[MLKEM_N >> 1];
 } poly_mulcache;
 
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1)
-#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4)
-#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5)
-#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10)
-#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11)
-#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4)
-#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5)
-#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10)
-#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11)
-#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q)
-/* End of static namespacing */
-
-/************************************************************
- * Name: scalar_compress_d1
- *
- * Description: Computes round(u * 2 / q)
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 1.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d1(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 2)
-  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
-{
-  uint32_t d0 = u << 1;
-  d0 *= 645083;
-  d0 += 1u << 30;
-  d0 >>= 31;
-  return d0;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_compress_d4
- *
- * Description: Computes round(u * 16 / q) % 16
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 4.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d4(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 16)
-  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
-{
-  uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */
-  return (d0 + (1u << 27)) >> 28;      /* round(d0/2^28) */
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d4
- *
- * Description: Computes round(u * q / 16)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 4.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d4(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 16)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 8) / 16; }
-
-/************************************************************
- * Name: scalar_compress_d5
- *
- * Description: Computes round(u * 32 / q) % 32
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 5.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d5(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 32)
-  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
-{
-  uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */
-  return (d0 + (1u << 26)) >> 27;      /* round(d0/2^27) */
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d5
- *
- * Description: Computes round(u * q / 32)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 5.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 32
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d5(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 32)
-  ensures(return_value <= MLKEM_Q - 1)
-) { return ((u * MLKEM_Q) + 16) / 32; }
-
-/************************************************************
- * Name: scalar_compress_d10
- *
- * Description: Computes round(u * 2**10 / q) % 2**10
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d10(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < (1u << 10))
-  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
-{
-  uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^32 / MLKEM_Q) */
-  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
-  return (d0 & 0x3FF);
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d10
- *
- * Description: Computes round(u * q / 1024)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d10(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 1024)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 512) / 1024; }
-
-/************************************************************
- * Name: scalar_compress_d11
- *
- * Description: Computes round(u * 2**11 / q) % 2**11
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 11.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d11(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < (1u << 11))
-  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
-{
-  uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */
-  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
-  return (d0 & 0x7FF);
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d11
- *
- * Description: Computes round(u * q / 1024)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d11(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 2048)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 1024) / 2048; }
-
-/************************************************************
- * Name: scalar_signed_to_unsigned_q
- *
- * Description: converts signed polynomial coefficient
- *              from signed (-3328 .. 3328) form to
- *              unsigned form (0 .. 3328).
- *
- * Note: Cryptographic constant time implementation
- *
- * Examples:       0 -> 0
- *                 1 -> 1
- *              3328 -> 3328
- *                -1 -> 3328
- *                -2 -> 3327
- *             -3328 -> 1
- *
- * Arguments: c: signed coefficient to be converted
- ************************************************************/
-static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
-__contract__(
-  requires(c > -MLKEM_Q && c < MLKEM_Q)
-  ensures(return_value >= 0 && return_value < MLKEM_Q)
-  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
-{
-  debug_assert_abs_bound(&c, 1, MLKEM_Q);
-
-  /* Add Q if c is negative, but in constant time */
-  c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
-
-  /* and therefore cast to uint16_t is safe. */
-  debug_assert_bound(&c, 1, 0, MLKEM_Q);
-  return (uint16_t)c;
-}
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
-    (MLKEM_K == 2 || MLKEM_K == 3)
-#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
-/*************************************************
- * Name:        poly_compress_d4
- *
- * Description: Compression (4 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
-
-#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
-/*************************************************
- * Name:        poly_compress_d10
- *
- * Description: Compression (10 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
-
-#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
-/*************************************************
- * Name:        poly_decompress_d4
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
-
-#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
-/*************************************************
- * Name:        poly_decompress_d10
- *
- * Description: De-serialization and subsequent decompression (10 bits) of a
- *              polynomial; approximate inverse of poly_compress_d10
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d10(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
-          || MLKEM_K == 3) */
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
-#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
-/*************************************************
- * Name:        poly_compress_d5
- *
- * Description: Compression (5 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
-
-#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
-/*************************************************
- * Name:        poly_compress_d11
- *
- * Description: Compression (11 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
-
-#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
-/*************************************************
- * Name:        poly_decompress_d5
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
-
-#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
-/*************************************************
- * Name:        poly_decompress_d11
- *
- * Description: De-serialization and subsequent decompression (11 bits) of a
- *              polynomial; approximate inverse of poly_compress_d11
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d11(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
-        */
-
-#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
-/*************************************************
- * Name:        poly_tobytes
- *
- * Description: Serialization of a polynomial.
- *              Signed coefficients are converted to
- *              unsigned form before serialization.
- *
- * Arguments:   INPUT:
- *              - a: const pointer to input polynomial,
- *                with each coefficient in the range [0,1,..,Q-1]
- *              OUTPUT
- *              - r: pointer to output byte array
- *                   (of MLKEM_POLYBYTES bytes)
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYBYTES))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r))
-);
-
-
-#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes)
-/*************************************************
- * Name:        poly_frombytes
- *
- * Description: De-serialization of a polynomial.
- *
- * Arguments:   INPUT
- *              - a: pointer to input byte array
- *                   (of MLKEM_POLYBYTES bytes)
- *              OUTPUT
- *              - r: pointer to output polynomial, with
- *                   each coefficient unsigned and in the range
- *                   0 .. 4095
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))
-);
-
-
-#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg)
-/*************************************************
- * Name:        poly_frommsg
- *
- * Description: Convert 32-byte message to polynomial
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *msg: pointer to input message
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
-__contract__(
-  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
-
-#define poly_tomsg MLKEM_NAMESPACE(poly_tomsg)
-/*************************************************
- * Name:        poly_tomsg
- *
- * Description: Convert polynomial to 32-byte message
- *
- * Arguments:   - uint8_t *msg: pointer to output message
- *              - const poly *r: pointer to input polynomial
- *                Coefficients must be unsigned canonical
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r)
-__contract__(
-  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(msg))
-);
-
 #define poly_basemul_montgomery_cached \
   MLKEM_NAMESPACE(poly_basemul_montgomery_cached)
 /*************************************************
@@ -715,4 +204,56 @@ __contract__(
   assigns(object_whole(r))
 );
 
+#define poly_ntt MLKEM_NAMESPACE(poly_ntt)
+/*************************************************
+ * Name:        poly_ntt
+ *
+ * Description: Computes negacyclic number-theoretic transform (NTT) of
+ *              a polynomial in place.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bound by MLKEM_Q in absolute value.
+ *
+ *              The output polynomial is in bitreversed order, and
+ *              coefficient-wise bound by NTT_BOUND in absolute value.
+ *
+ *              (NOTE: Sometimes the input to the NTT is actually smaller,
+ *               which gives better bounds.)
+ *
+ * Arguments:   - poly *r: pointer to in/output polynomial
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND))
+);
+
+#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont)
+/*************************************************
+ * Name:        poly_invntt_tomont
+ *
+ * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+ *              of a polynomial in place;
+ *              inputs assumed to be in bitreversed order, output in normal
+ *              order
+ *
+ *              The input is assumed to be in bitreversed order, and can
+ *              have arbitrary coefficients in int16_t.
+ *
+ *              The output polynomial is in normal order, and
+ *              coefficient-wise bound by INVNTT_BOUND in absolute value.
+ *
+ * Arguments:   - poly *r: pointer to in/output polynomial
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND))
+);
+
 #endif /* POLY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly_k.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly_k.c
new file mode 100644
index 000000000..c2d330ea9
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly_k.c
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "poly_k.h"
+#include <stdint.h>
+#include <string.h>
+#include "arith_backend.h"
+#include "compress.h"
+#include "sampling.h"
+#include "symmetric.h"
+
+#include "debug.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1)
+#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2)
+/* End of static namespacing */
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
+                         const polyvec *a)
+{
+  unsigned i;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_decompress_du(polyvec *r,
+                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
+  }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
+{
+  unsigned i;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
+  }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_ntt(polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_ntt(&r->vec[i]);
+  }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_invntt_tomont(polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_invntt_tomont(&r->vec[i]);
+  }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND);
+}
+
+#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+{
+  unsigned i;
+  poly t;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+  /* Seed the accumulator r with the first product, then add the rest. */
+  poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]);
+  for (i = 1; i < MLKEM_K; i++)
+  {
+    poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i],
+                                   &b_cache->vec[i]);
+    poly_add(r, &t);
+  }
+
+  /*
+   * r holds MLKEM_N coefficients, each a sum of MLKEM_K products assumed
+   * to be below 2 * MLKEM_Q in absolute value. This bound holds for the
+   * C implementation but is omitted from the spec so as not to constrain
+   * native implementations; it is checked here nonetheless.
+   */
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_K * 2 * MLKEM_Q);
+}
+#else  /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+{
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+  /* Omitting bounds assertion for cache since native implementations may
+   * decide not to use a mulcache. Note that the C backend implementation
+   * of poly_basemul_montgomery_cached() does still include the check. */
+  polyvec_basemul_acc_montgomery_cached_native(r->coeffs, (const int16_t *)a,
+                                               (const int16_t *)b,
+                                               (const int16_t *)b_cache);
+}
+#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
+{
+  polyvec_mulcache b_cache;
+  polyvec_mulcache_compute(&b_cache, b);
+  polyvec_basemul_acc_montgomery_cached(r, a, b, &b_cache);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_mulcache_compute(&x->vec[i], &a->vec[i]);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_reduce(polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_reduce(&r->vec[i]);
+  }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_add(polyvec *r, const polyvec *b)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_add(&r->vec[i], &b->vec[i]);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_tomont(polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_tomont(&r->vec[i]);
+  }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
+}
+
+
+/*************************************************
+ * Name:        poly_cbd_eta1
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta1(poly *r,
+                                 const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
+)
+{
+#if MLKEM_ETA1 == 2
+  poly_cbd2(r, buf);
+#elif MLKEM_ETA1 == 3
+  poly_cbd3(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA1"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+{
+  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
+  memcpy(extkey0, seed, MLKEM_SYMBYTES);
+  memcpy(extkey1, seed, MLKEM_SYMBYTES);
+  memcpy(extkey2, seed, MLKEM_SYMBYTES);
+  memcpy(extkey3, seed, MLKEM_SYMBYTES);
+  extkey0[MLKEM_SYMBYTES] = nonce0;
+  extkey1[MLKEM_SYMBYTES] = nonce1;
+  extkey2[MLKEM_SYMBYTES] = nonce2;
+  extkey3[MLKEM_SYMBYTES] = nonce3;
+  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
+  poly_cbd_eta1(r0, buf0);
+  poly_cbd_eta1(r1, buf1);
+  poly_cbd_eta1(r2, buf2);
+  poly_cbd_eta1(r3, buf3);
+
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
+}
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+/*************************************************
+ * Name:        poly_cbd_eta2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta2(poly *r,
+                                 const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1)))
+{
+#if MLKEM_ETA2 == 2
+  poly_cbd2(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA2"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+{
+  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
+  /* PRF input is the seed followed by the one-byte nonce. */
+  memcpy(extkey, seed, MLKEM_SYMBYTES);
+  extkey[MLKEM_SYMBYTES] = nonce;
+  prf_eta2(buf, extkey);
+  /* Turn the PRF output bytes into a polynomial via the eta2 sampler. */
+  poly_cbd_eta2(r, buf);
+  /* Same bound as the ensures clause of poly_cbd_eta2. */
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1);
+}
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+
+#if MLKEM_K == 2
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+{
+  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
+  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+  extkey[0][MLKEM_SYMBYTES] = nonce0;
+  extkey[1][MLKEM_SYMBYTES] = nonce1;
+  extkey[2][MLKEM_SYMBYTES] = nonce2;
+  extkey[3][MLKEM_SYMBYTES] = nonce3;
+  /* Two eta1 and two eta2 PRF calls, each on its own seed||nonce input. */
+  prf_eta1(buf1[0], extkey[0]);
+  prf_eta1(buf1[1], extkey[1]);
+  prf_eta2(buf2[0], extkey[2]);
+  prf_eta2(buf2[1], extkey[3]);
+  /* r0, r1 are sampled with eta1; r2, r3 are sampled with eta2. */
+  poly_cbd_eta1(r0, buf1[0]);
+  poly_cbd_eta1(r1, buf1[1]);
+  poly_cbd_eta2(r2, buf2[0]);
+  poly_cbd_eta2(r3, buf2[1]);
+  /* Bounds match the ensures clauses of poly_cbd_eta1 / poly_cbd_eta2. */
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1);
+}
+#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly_k.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly_k.h
new file mode 100644
index 000000000..0aea95912
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/poly_k.h
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef POLY_K_H
+#define POLY_K_H
+
+#include <stdint.h>
+#include "common.h"
+#include "compress.h"
+#include "poly.h"
+
+#define polyvec MLKEM_NAMESPACE_K(polyvec)
+typedef struct
+{
+  poly vec[MLKEM_K];
+} ALIGN polyvec;
+
+#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache)
+typedef struct
+{
+  poly_mulcache vec[MLKEM_K];
+} polyvec_mulcache;
+
+#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du)
+/*************************************************
+ * Name:        poly_compress_du
+ *
+ * Description: Compression (du bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU)))
+{
+#if MLKEM_DU == 10
+  poly_compress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_compress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du)
+/*************************************************
+ * Name:        poly_decompress_du
+ *
+ * Description: De-serialization and subsequent decompression (du bits) of a
+ *              polynomial; approximate inverse of poly_compress_du
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_du(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DU == 10
+  poly_decompress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_decompress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv)
+/*************************************************
+ * Name:        poly_compress_dv
+ *
+ * Description: Compression (dv bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r)))
+{
+#if MLKEM_DV == 4
+  poly_compress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_compress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+
+#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv)
+/*************************************************
+ * Name:        poly_decompress_dv
+ *
+ * Description: De-serialization and subsequent decompression (dv bits) of a
+ *              polynomial; approximate inverse of poly_compress_dv
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_dv(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DV == 4
+  poly_decompress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_decompress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du)
+/*************************************************
+ * Name:        polyvec_compress_du
+ *
+ * Description: Compress and serialize vector of polynomials
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU)
+ *              - const polyvec *a: pointer to input vector of polynomials.
+ *                                  Coefficients must be unsigned canonical,
+ *                                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
+                         const polyvec *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(forall(k0, 0, MLKEM_K,
+         array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+  assigns(object_whole(r))
+);
+
+#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du)
+/*************************************************
+ * Name:        polyvec_decompress_du
+ *
+ * Description: De-serialize and decompress vector of polynomials;
+ *              approximate inverse of polyvec_compress_du
+ *
+ * Arguments:   - polyvec *r:       pointer to output vector of polynomials.
+ *                Output will have coefficients normalized to [0,..,q-1].
+ *              - const uint8_t *a: pointer to input byte array
+ *                                  (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU)
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_decompress_du(polyvec *r,
+                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(k0, 0, MLKEM_K,
+         array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+);
+
+#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes)
+/*************************************************
+ * Name:        polyvec_tobytes
+ *
+ * Description: Serialize vector of polynomials
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (needs space for MLKEM_POLYVECBYTES)
+ *              - const polyvec *a: pointer to input vector of polynomials
+ *                  Each polynomial must have coefficients in [0,..,q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
+__contract__(
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(memory_no_alias(r, MLKEM_POLYVECBYTES))
+  requires(forall(k0, 0, MLKEM_K,
+         array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+  assigns(object_whole(r))
+);
+
+#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes)
+/*************************************************
+ * Name:        polyvec_frombytes
+ *
+ * Description: De-serialize vector of polynomials;
+ *              inverse of polyvec_tobytes
+ *
+ * Arguments:   - polyvec *r: pointer to output vector of polynomials.
+ *                 Output will have coefficients normalized in [0..4095].
+ *              - const uint8_t *a: pointer to input byte array
+ *                 (of length MLKEM_POLYVECBYTES)
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(memory_no_alias(a, MLKEM_POLYVECBYTES))
+  assigns(object_whole(r))
+  ensures(forall(k0, 0, MLKEM_K,
+        array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
+);
+
+#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt)
+/*************************************************
+ * Name:        polyvec_ntt
+ *
+ * Description: Apply forward NTT to all elements of a vector of polynomials.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bound by MLKEM_Q in absolute value.
+ *
+ *              The output polynomial is in bitreversed order, and
+ *              coefficient-wise bound by NTT_BOUND in absolute value.
+ *
+ * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_ntt(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(forall(j, 0, MLKEM_K,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+  assigns(object_whole(r))
+  ensures(forall(j, 0, MLKEM_K,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND)))
+);
+
+#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont)
+/*************************************************
+ * Name:        polyvec_invntt_tomont
+ *
+ * Description: Apply inverse NTT to all elements of a vector of polynomials
+ *              and multiply by Montgomery factor 2^16
+ *
+ *              The input is assumed to be in bitreversed order, and can
+ *              have arbitrary coefficients in int16_t.
+ *
+ *              The output polynomial is in normal order, and
+ *              coefficient-wise bound by INVNTT_BOUND in absolute value.
+ *
+ *
+ * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_invntt_tomont(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(j, 0, MLKEM_K,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, INVNTT_BOUND)))
+);
+
+#define polyvec_basemul_acc_montgomery \
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery)
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery
+ *
+ * Description: Multiply elements of a and b in NTT domain, accumulate into r,
+ *              and multiply by 2^-16.
+ *
+ * Arguments: - poly *r: pointer to output polynomial
+ *            - const polyvec *a: pointer to first input vector of polynomials
+ *            - const polyvec *b: pointer to second input vector of polynomials
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(memory_no_alias(b, sizeof(polyvec)))
+  requires(forall(k1, 0, MLKEM_K,
+    array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
+  assigns(memory_slice(r, sizeof(poly)))
+);
+
+
+#define polyvec_basemul_acc_montgomery_cached \
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached)
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery_cached
+ *
+ * Description: Scalar product of two vectors of polynomials in NTT domain,
+ *              using mulcache for second operand.
+ *
+ *              Bounds:
+ *              - Every coefficient of a is assumed to be in [0..4095]
+ *              - No bounds guarantees for the coefficients in the result.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const polyvec *a: pointer to first input polynomial vector
+ *              - const polyvec *b: pointer to second input polynomial vector
+ *              - const polyvec_mulcache *b_cache: pointer to mulcache
+ *                  for second input polynomial vector. Can be computed
+ *                  via polyvec_mulcache_compute().
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(memory_no_alias(b, sizeof(polyvec)))
+  requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache)))
+  requires(forall(k1, 0, MLKEM_K,
+     array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
+  assigns(memory_slice(r, sizeof(poly)))
+);
+
+#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute)
+/************************************************************
+ * Name: polyvec_mulcache_compute
+ *
+ * Description: Computes the mulcache for a vector of polynomials in NTT domain
+ *
+ *              The mulcache of a degree-2 polynomial b := b0 + b1*X
+ *              in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when
+ *              computing products of b in Fq[X]/(X^2-zeta).
+ *
+ *              The mulcache of a polynomial in NTT domain -- which is
+ *              a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta),
+ *              for varying zeta, is the 128-tuple of mulcaches of those
+ *              polynomials.
+ *
+ *              The mulcache of a vector of polynomials is the vector
+ *              of mulcaches of its entries.
+ *
+ * Arguments: - x: Pointer to mulcache to be populated
+ *            - a: Pointer to input polynomial vector
+ ************************************************************/
+/*
+ * NOTE: The default C implementation of this function populates
+ * the mulcache with values in (-q,q), but this is not needed for the
+ * higher level safety proofs, and thus not part of the spec.
+ */
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
+__contract__(
+  requires(memory_no_alias(x, sizeof(polyvec_mulcache)))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  assigns(object_whole(x))
+);
+
+#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce)
+/*************************************************
+ * Name:        polyvec_reduce
+ *
+ * Description: Applies Barrett reduction to each coefficient
+ *              of each element of a vector of polynomials;
+ *              for details of the Barrett reduction see comments in reduce.c
+ *
+ * Arguments:   - polyvec *r: pointer to input/output vector of polynomials
+ **************************************************/
+/*
+ * NOTE: The semantics of polyvec_reduce() is different in
+ *       the reference implementation, which requires
+ *       signed canonical output data. Unsigned canonical
+ *       outputs are better suited to the only remaining
+ *       use of poly_reduce() in the context of (de)serialization.
+ */
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_reduce(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(k0, 0, MLKEM_K,
+    array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+);
+
+#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add)
+/*************************************************
+ * Name:        polyvec_add
+ *
+ * Description: Add vectors of polynomials
+ *
+ * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be
+ *              added to
+ *            - const polyvec *b: pointer to second input vector of polynomials
+ *
+ * The coefficients of r and b must be so that the addition does
+ * not overflow. Otherwise, the behaviour of this function is undefined.
+ *
+ * The coefficients returned in *r are in int16_t which is sufficient
+ * to prove type-safety of calling units. Therefore, no stronger
+ * ensures clause is required on this function.
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_add(polyvec *r, const polyvec *b)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(memory_no_alias(b, sizeof(polyvec)))
+  requires(forall(j0, 0, MLKEM_K,
+          forall(k0, 0, MLKEM_N,
+            (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX)))
+  requires(forall(j1, 0, MLKEM_K,
+          forall(k1, 0, MLKEM_N,
+            (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN)))
+  assigns(object_whole(r))
+);
+
+#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont)
+/*************************************************
+ * Name:        polyvec_tomont
+ *
+ * Description: Inplace conversion of all coefficients of a polynomial
+ *              vector from normal domain to Montgomery domain
+ *
+ *              Bounds: Output < q in absolute value.
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_tomont(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(memory_slice(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(j, 0, MLKEM_K,
+    array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+);
+
+#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial distribution
+ * with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+/* Depending on MLKEM_K, some of r0-r3 point into the same object (see the
+   per-K cases below), so memory_no_alias cannot be stated per pointer.
+
+   NOTE: Somehow it is important to use memory_no_alias() first in the
+         conjunctions defining each case.
+*/
+#if MLKEM_K == 2
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
+    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+);
+#elif MLKEM_K == 4
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case B: r0, r1, r2, r3 consecutive */
+    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+);
+#elif MLKEM_K == 3
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case C: r0, r1, r2 consecutive */
+ (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
+  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+);
+#endif /* MLKEM_K */
+
+#if MLKEM_ETA1 == MLKEM_ETA2
+/*
+ * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
+ * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
+ * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
+ */
+#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
+#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2)
+/*************************************************
+ * Name:        poly_getnoise_eta2
+ *
+ * Description: Sample a polynomial deterministically from a seed and a nonce,
+ *              with output polynomial close to centered binomial distribution
+ *              with parameter MLKEM_ETA2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
+);
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+#if MLKEM_K == 2
+#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1122_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial
+ * distribution with parameter MLKEM_ETA1 (r0, r1) and MLKEM_ETA2 (r2, r3)
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+__contract__(
+  requires( /* r0, r1 consecutive, r2, r3 consecutive */
+ (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
+     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1))
+);
+#endif /* MLKEM_K == 2 */
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/polyvec.c
deleted file mode 100644
index 50ea1c34a..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/polyvec.c
+++ /dev/null
@@ -1,330 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "polyvec.h"
-#include <stdint.h>
-#include <string.h>
-#include "arith_backend.h"
-#include "cbd.h"
-#include "ntt.h"
-#include "poly.h"
-#include "symmetric.h"
-
-#include "debug.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1)
-#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2)
-/* End of static namespacing */
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
-                         const polyvec *a)
-{
-  unsigned i;
-  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_decompress_du(polyvec *r,
-                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
-  }
-
-  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
-{
-  unsigned i;
-  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
-  }
-
-  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_ntt(polyvec *r)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_ntt(&r->vec[i]);
-  }
-
-  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_invntt_tomont(polyvec *r)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_invntt_tomont(&r->vec[i]);
-  }
-
-  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND);
-}
-
-#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
-                                           const polyvec *b,
-                                           const polyvec_mulcache *b_cache)
-{
-  unsigned i;
-  poly t;
-  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
-
-  poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]);
-  for (i = 1; i < MLKEM_K; i++)
-  {
-    poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i],
-                                   &b_cache->vec[i]);
-    poly_add(r, &t);
-  }
-
-  /*
-   * This bound is true for the C implementation, but not needed
-   * in the higher level bounds reasoning. It is thus omitted
-   * them from the spec to not unnecessarily constrain native
-   * implementations, but checked here nonetheless.
-   */
-  debug_assert_abs_bound(r, MLKEM_K, MLKEM_N * 2 * MLKEM_Q);
-}
-#else  /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
-                                           const polyvec *b,
-                                           const polyvec_mulcache *b_cache)
-{
-  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
-  /* Omitting bounds assertion for cache since native implementations may
-   * decide not to use a mulcache. Note that the C backend implementation
-   * of poly_basemul_montgomery_cached() does still include the check. */
-  polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache);
-}
-#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
-{
-  polyvec_mulcache b_cache;
-  polyvec_mulcache_compute(&b_cache, b);
-  polyvec_basemul_acc_montgomery_cached(r, a, b, &b_cache);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_mulcache_compute(&x->vec[i], &a->vec[i]);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_reduce(polyvec *r)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_reduce(&r->vec[i]);
-  }
-
-  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_add(polyvec *r, const polyvec *b)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_add(&r->vec[i], &b->vec[i]);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_tomont(polyvec *r)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_tomont(&r->vec[i]);
-  }
-
-  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
-}
-
-
-/*************************************************
- * Name:        poly_cbd_eta1
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA1.
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-static INLINE void poly_cbd_eta1(poly *r,
-                                 const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
-)
-{
-#if MLKEM_ETA1 == 2
-  poly_cbd2(r, buf);
-#elif MLKEM_ETA1 == 3
-  poly_cbd3(r, buf);
-#else
-#error "Invalid value of MLKEM_ETA1"
-#endif
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-{
-  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
-  memcpy(extkey0, seed, MLKEM_SYMBYTES);
-  memcpy(extkey1, seed, MLKEM_SYMBYTES);
-  memcpy(extkey2, seed, MLKEM_SYMBYTES);
-  memcpy(extkey3, seed, MLKEM_SYMBYTES);
-  extkey0[MLKEM_SYMBYTES] = nonce0;
-  extkey1[MLKEM_SYMBYTES] = nonce1;
-  extkey2[MLKEM_SYMBYTES] = nonce2;
-  extkey3[MLKEM_SYMBYTES] = nonce3;
-  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
-  poly_cbd_eta1(r0, buf0);
-  poly_cbd_eta1(r1, buf1);
-  poly_cbd_eta1(r2, buf2);
-  poly_cbd_eta1(r3, buf3);
-
-  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
-}
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-/*************************************************
- * Name:        poly_cbd_eta2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA2.
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-static INLINE void poly_cbd_eta2(poly *r,
-                                 const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1)))
-{
-#if MLKEM_ETA2 == 2
-  poly_cbd2(r, buf);
-#else
-#error "Invalid value of MLKEM_ETA2"
-#endif
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-{
-  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
-
-  memcpy(extkey, seed, MLKEM_SYMBYTES);
-  extkey[MLKEM_SYMBYTES] = nonce;
-  prf_eta2(buf, extkey);
-
-  poly_cbd_eta2(r, buf);
-
-  debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA1 + 1);
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-
-#if MLKEM_K == 2
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-{
-  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
-  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
-  extkey[0][MLKEM_SYMBYTES] = nonce0;
-  extkey[1][MLKEM_SYMBYTES] = nonce1;
-  extkey[2][MLKEM_SYMBYTES] = nonce2;
-  extkey[3][MLKEM_SYMBYTES] = nonce3;
-
-  prf_eta1(buf1[0], extkey[0]);
-  prf_eta1(buf1[1], extkey[1]);
-  prf_eta2(buf2[0], extkey[2]);
-  prf_eta2(buf2[1], extkey[3]);
-
-  poly_cbd_eta1(r0, buf1[0]);
-  poly_cbd_eta1(r1, buf1[1]);
-  poly_cbd_eta2(r2, buf2[0]);
-  poly_cbd_eta2(r3, buf2[1]);
-
-  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1);
-  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1);
-}
-#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/polyvec.h
deleted file mode 100644
index 8be8579e0..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/polyvec.h
+++ /dev/null
@@ -1,595 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef POLYVEC_H
-#define POLYVEC_H
-
-#include <stdint.h>
-#include "common.h"
-#include "poly.h"
-
-#define polyvec MLKEM_NAMESPACE_K(polyvec)
-typedef struct
-{
-  poly vec[MLKEM_K];
-} ALIGN polyvec;
-
-#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache)
-typedef struct
-{
-  poly_mulcache vec[MLKEM_K];
-} polyvec_mulcache;
-
-#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du)
-/*************************************************
- * Name:        poly_compress_du
- *
- * Description: Compression (du bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU],
-                                    const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU)))
-{
-#if MLKEM_DU == 10
-  poly_compress_d10(r, a);
-#elif MLKEM_DU == 11
-  poly_compress_d11(r, a);
-#else
-#error "Invalid value of MLKEM_DU"
-#endif
-}
-
-#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du)
-/*************************************************
- * Name:        poly_decompress_du
- *
- * Description: De-serialization and subsequent decompression (du bits) of a
- *              polynomial; approximate inverse of poly_compress_du
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-static INLINE void poly_decompress_du(
-    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-#if MLKEM_DU == 10
-  poly_decompress_d10(r, a);
-#elif MLKEM_DU == 11
-  poly_decompress_d11(r, a);
-#else
-#error "Invalid value of MLKEM_DU"
-#endif
-}
-
-#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv)
-/*************************************************
- * Name:        poly_compress_dv
- *
- * Description: Compression (dv bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV],
-                                    const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r)))
-{
-#if MLKEM_DV == 4
-  poly_compress_d4(r, a);
-#elif MLKEM_DV == 5
-  poly_compress_d5(r, a);
-#else
-#error "Invalid value of MLKEM_DV"
-#endif
-}
-
-
-#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv)
-/*************************************************
- * Name:        poly_decompress_dv
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-static INLINE void poly_decompress_dv(
-    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-#if MLKEM_DV == 4
-  poly_decompress_d4(r, a);
-#elif MLKEM_DV == 5
-  poly_decompress_d5(r, a);
-#else
-#error "Invalid value of MLKEM_DV"
-#endif
-}
-
-#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du)
-/*************************************************
- * Name:        polyvec_compress_du
- *
- * Description: Compress and serialize vector of polynomials
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU)
- *              - const polyvec *a: pointer to input vector of polynomials.
- *                                  Coefficients must be unsigned canonical,
- *                                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
-                         const polyvec *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  requires(forall(k0, 0, MLKEM_K,
-         array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-  assigns(object_whole(r))
-);
-
-#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du)
-/*************************************************
- * Name:        polyvec_decompress_du
- *
- * Description: De-serialize and decompress vector of polynomials;
- *              approximate inverse of polyvec_compress_du
- *
- * Arguments:   - polyvec *r:       pointer to output vector of polynomials.
- *                Output will have coefficients normalized to [0,..,q-1].
- *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU)
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_decompress_du(polyvec *r,
-                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  assigns(object_whole(r))
-  ensures(forall(k0, 0, MLKEM_K,
-         array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-);
-
-#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes)
-/*************************************************
- * Name:        polyvec_tobytes
- *
- * Description: Serialize vector of polynomials
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (needs space for MLKEM_POLYVECBYTES)
- *              - const polyvec *a: pointer to input vector of polynomials
- *                  Each polynomial must have coefficients in [0,..,q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
-__contract__(
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  requires(memory_no_alias(r, MLKEM_POLYVECBYTES))
-  requires(forall(k0, 0, MLKEM_K,
-         array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-  assigns(object_whole(r))
-);
-
-#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes)
-/*************************************************
- * Name:        polyvec_frombytes
- *
- * Description: De-serialize vector of polynomials;
- *              inverse of polyvec_tobytes
- *
- * Arguments:   - const polyvec *a: pointer to output vector of polynomials
- *                 (of length MLKEM_POLYVECBYTES). Output will have coefficients
- *                 normalized in [0..4095].
- *              - uint8_t *r: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  requires(memory_no_alias(a, MLKEM_POLYVECBYTES))
-  assigns(object_whole(r))
-  ensures(forall(k0, 0, MLKEM_K,
-        array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
-);
-
-#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt)
-/*************************************************
- * Name:        polyvec_ntt
- *
- * Description: Apply forward NTT to all elements of a vector of polynomials.
- *
- *              The input is assumed to be in normal order and
- *              coefficient-wise bound by MLKEM_Q in absolute value.
- *
- *              The output polynomial is in bitreversed order, and
- *              coefficient-wise bound by NTT_BOUND in absolute value.
- *
- * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_ntt(polyvec *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  requires(forall(j, 0, MLKEM_K,
-  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
-  assigns(object_whole(r))
-  ensures(forall(j, 0, MLKEM_K,
-  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND)))
-);
-
-#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont)
-/*************************************************
- * Name:        polyvec_invntt_tomont
- *
- * Description: Apply inverse NTT to all elements of a vector of polynomials
- *              and multiply by Montgomery factor 2^16
- *
- *              The input is assumed to be in bitreversed order, and can
- *              have arbitrary coefficients in int16_t.
- *
- *              The output polynomial is in normal order, and
- *              coefficient-wise bound by INVNTT_BOUND in absolute value.
- *
- *
- * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_invntt_tomont(polyvec *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  assigns(object_whole(r))
-  ensures(forall(j, 0, MLKEM_K,
-  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, INVNTT_BOUND)))
-);
-
-#define polyvec_basemul_acc_montgomery \
-  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery)
-/*************************************************
- * Name:        polyvec_basemul_acc_montgomery
- *
- * Description: Multiply elements of a and b in NTT domain, accumulate into r,
- *              and multiply by 2^-16.
- *
- * Arguments: - poly *r: pointer to output polynomial
- *            - const polyvec *a: pointer to first input vector of polynomials
- *            - const polyvec *b: pointer to second input vector of polynomials
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  requires(memory_no_alias(b, sizeof(polyvec)))
-  requires(forall(k1, 0, MLKEM_K,
-    array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
-  assigns(memory_slice(r, sizeof(poly)))
-);
-
-
-#define polyvec_basemul_acc_montgomery_cached \
-  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached)
-/*************************************************
- * Name:        polyvec_basemul_acc_montgomery_cached
- *
- * Description: Scalar product of two vectors of polynomials in NTT domain,
- *              using mulcache for second operand.
- *
- *              Bounds:
- *              - Every coefficient of a is assumed to be in [0..4095]
- *              - No bounds guarantees for the coefficients in the result.
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const polyvec *a: pointer to first input polynomial vector
- *              - const polyvec *b: pointer to second input polynomial vector
- *              - const polyvec_mulcache *b_cache: pointer to mulcache
- *                  for second input polynomial vector. Can be computed
- *                  via polyvec_mulcache_compute().
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
-                                           const polyvec *b,
-                                           const polyvec_mulcache *b_cache)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  requires(memory_no_alias(b, sizeof(polyvec)))
-  requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache)))
-  requires(forall(k1, 0, MLKEM_K,
-     array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
-  assigns(memory_slice(r, sizeof(poly)))
-);
-
-#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute)
-/************************************************************
- * Name: polyvec_mulcache_compute
- *
- * Description: Computes the mulcache for a vector of polynomials in NTT domain
- *
- *              The mulcache of a degree-2 polynomial b := b0 + b1*X
- *              in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when
- *              computing products of b in Fq[X]/(X^2-zeta).
- *
- *              The mulcache of a polynomial in NTT domain -- which is
- *              a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta),
- *              for varying zeta, is the 128-tuple of mulcaches of those
- *              polynomials.
- *
- *              The mulcache of a vector of polynomials is the vector
- *              of mulcaches of its entries.
- *
- * Arguments: - x: Pointer to mulcache to be populated
- *            - a: Pointer to input polynomial vector
- ************************************************************/
-/*
- * NOTE: The default C implementation of this function populates
- * the mulcache with values in (-q,q), but this is not needed for the
- * higher level safety proofs, and thus not part of the spec.
- */
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
-__contract__(
-  requires(memory_no_alias(x, sizeof(polyvec_mulcache)))
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  assigns(object_whole(x))
-);
-
-#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce)
-/*************************************************
- * Name:        polyvec_reduce
- *
- * Description: Applies Barrett reduction to each coefficient
- *              of each element of a vector of polynomials;
- *              for details of the Barrett reduction see comments in reduce.c
- *
- * Arguments:   - polyvec *r: pointer to input/output polynomial
- **************************************************/
-/*
- * NOTE: The semantics of polyvec_reduce() is different in
- *       the reference implementation, which requires
- *       signed canonical output data. Unsigned canonical
- *       outputs are better suited to the only remaining
- *       use of poly_reduce() in the context of (de)serialization.
- */
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_reduce(polyvec *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  assigns(object_whole(r))
-  ensures(forall(k0, 0, MLKEM_K,
-    array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-);
-
-#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add)
-/*************************************************
- * Name:        polyvec_add
- *
- * Description: Add vectors of polynomials
- *
- * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be
- *              added to
- *            - const polyvec *b: pointer to second input vector of polynomials
- *
- * The coefficients of r and b must be so that the addition does
- * not overflow. Otherwise, the behaviour of this function is undefined.
- *
- * The coefficients returned in *r are in int16_t which is sufficient
- * to prove type-safety of calling units. Therefore, no stronger
- * ensures clause is required on this function.
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_add(polyvec *r, const polyvec *b)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  requires(memory_no_alias(b, sizeof(polyvec)))
-  requires(forall(j0, 0, MLKEM_K,
-          forall(k0, 0, MLKEM_N,
-            (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX)))
-  requires(forall(j1, 0, MLKEM_K,
-          forall(k1, 0, MLKEM_N,
-            (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN)))
-  assigns(object_whole(r))
-);
-
-#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont)
-/*************************************************
- * Name:        polyvec_tomont
- *
- * Description: Inplace conversion of all coefficients of a polynomial
- *              vector from normal domain to Montgomery domain
- *
- *              Bounds: Output < q in absolute value.
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_tomont(polyvec *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  assigns(memory_slice(r, sizeof(polyvec)))
-  assigns(object_whole(r))
-  ensures(forall(j, 0, MLKEM_K,
-    array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
-);
-
-#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and nonces, with output polynomials close to centered binomial distribution
- * with parameter MLKEM_ETA1.
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-/* Depending on MLKEM_K, the pointers passed to this function belong
-   to the same objects, so we cannot use memory_no_alias for r0-r3.
-
-   NOTE: Somehow it is important to use memory_no_alias() first in the
-         conjunctions defining each case.
-*/
-#if MLKEM_K == 2
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
-    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 4
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case B: r0, r1, r2, r3 consecutive */
-    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 3
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case C: r0, r1, r2 consecutive */
- (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
-  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#endif /* MLKEM_K */
-
-#if MLKEM_ETA1 == MLKEM_ETA2
-/*
- * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
- * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
- * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
- */
-#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
-#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2)
-/*************************************************
- * Name:        poly_getnoise_eta2
- *
- * Description: Sample a polynomial deterministically from a seed and a nonce,
- *              with output polynomial close to centered binomial distribution
- *              with parameter MLKEM_ETA2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1122_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and a nonces, with output polynomials close to centered binomial
- * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-__contract__(
-  requires( /* r0, r1 consecutive, r2, r3 consecutive */
- (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
-  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
-     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
-);
-#endif /* MLKEM_K == 2 */
-
-#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/reduce.h
deleted file mode 100644
index b432a4201..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/reduce.h
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef REDUCE_H
-#define REDUCE_H
-
-#include <stdint.h>
-#include "cbmc.h"
-#include "common.h"
-#include "debug.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16)
-#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic)
-#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce)
-#define fqmul MLKEM_NAMESPACE(fqmul)
-#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce)
-/* End of static namespacing */
-
-#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */
-
-/*************************************************
- * Name:        cast_uint16_to_int16
- *
- * Description: Cast uint16 value to int16
- *
- * Returns:
- *   input x in     0 .. 32767: returns value unchanged
- *   input x in 32768 .. 65535: returns (x - 65536)
- **************************************************/
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
-ALWAYS_INLINE
-static INLINE int16_t cast_uint16_to_int16(uint16_t x)
-{
-  /*
-   * PORTABILITY: This relies on uint16_t -> int16_t
-   * being implemented as the inverse of int16_t -> uint16_t,
-   * which is implementation-defined (C99 6.3.1.3 (3))
-   * CBMC (correctly) fails to prove this conversion is OK,
-   * so we have to suppress that check here
-   */
-  return (int16_t)x;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/*************************************************
- * Name:        montgomery_reduce_generic
- *
- * Description: Generic Montgomery reduction; given a 32-bit integer a, computes
- *              16-bit integer congruent to a * R^-1 mod q, where R=2^16
- *
- * Arguments:   - int32_t a: input integer to be reduced
- *
- * Returns:     integer congruent to a * R^-1 modulo q, with absolute value
- *                <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2
- *
- **************************************************/
-ALWAYS_INLINE
-static INLINE int16_t montgomery_reduce_generic(int32_t a)
-{
-  /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */
-  const uint32_t QINV = 62209; /* q^-1 mod 2^16 */
-
-  /*  Compute a*q^{-1} mod 2^16 in unsigned representatives */
-  const uint16_t a_reduced = a & UINT16_MAX;
-  const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
-
-  /* Lift to signed canonical representative mod 2^16. */
-  const int16_t t = cast_uint16_to_int16(a_inverted);
-
-  int32_t r = a - ((int32_t)t * MLKEM_Q);
-  /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */
-
-  /*
-   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
-   * implementation-defined for negative left argument. Here,
-   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
-   */
-  r = r >> 16;
-  /* Bounds: |r >> 16| <= ceil(|r| / 2^16)
-   *                   <= ceil(|a| / 2^16 + MLKEM_Q / 2)
-   *                   <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
-   *
-   * (Note that |a >> n| = ceil(|a| / 2^16) for negative a)
-   */
-
-  return (int16_t)r;
-}
-
-/*************************************************
- * Name:        montgomery_reduce
- *
- * Description: Montgomery reduction
- *
- * Arguments:   - int32_t a: input integer to be reduced
- *                  Must be smaller than 2 * 2^12 * 2^15 in absolute value.
- *
- * Returns:     integer congruent to a * R^-1 modulo q,
- *              smaller than 2 * q in absolute value.
- **************************************************/
-static INLINE int16_t montgomery_reduce(int32_t a)
-__contract__(
-  requires(a > -(2 * UINT12_LIMIT * 32768))
-  requires(a <  (2 * UINT12_LIMIT * 32768))
-  ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
-)
-{
-  int16_t res;
-  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
-
-  res = montgomery_reduce_generic(a);
-  /* Bounds:
-   * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2
-   *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
-   *        < 2 * MLKEM_Q */
-
-  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
-  return res;
-}
-
-/*************************************************
- * Name:        fqmul
- *
- * Description: Montgomery multiplication modulo q=3329
- *
- * Arguments:   - int16_t a: first factor
- *                  Can be any int16_t.
- *              - int16_t b: second factor.
- *                  Must be signed canonical (abs value <(q+1)/2)
- *
- * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and
- * smaller than q in absolute value.
- *
- **************************************************/
-static INLINE int16_t fqmul(int16_t a, int16_t b)
-__contract__(
-  requires(b > -HALF_Q)
-  requires(b < HALF_Q)
-  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
-)
-{
-  int16_t res;
-  debug_assert_abs_bound(&b, 1, HALF_Q);
-
-  res = montgomery_reduce((int32_t)a * (int32_t)b);
-  /* Bounds:
-   * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
-   *        < MLKEM_Q
-   */
-
-  debug_assert_abs_bound(&res, 1, MLKEM_Q);
-  return res;
-}
-
-/*************************************************
- * Name:        barrett_reduce
- *
- * Description: Barrett reduction; given a 16-bit integer a, computes
- *              centered representative congruent to a mod q in
- *              {-(q-1)/2,...,(q-1)/2}
- *
- * Arguments:   - int16_t a: input integer to be reduced
- *
- * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
- **************************************************/
-static INLINE int16_t barrett_reduce(int16_t a)
-__contract__(
-  ensures(return_value > -HALF_Q && return_value < HALF_Q)
-)
-{
-  /*
-   * To divide by MLKEM_Q using Barrett multiplication, the "magic number"
-   * multiplier is round_to_nearest(2**26/MLKEM_Q)
-   */
-  const int BPOWER = 26;
-  const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q;
-
-  /*
-   * Compute round_to_nearest(a/MLKEM_Q) using the multiplier
-   * above and shift by BPOWER places.
-   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
-   * implementation-defined for negative left argument. Here,
-   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
-   */
-  const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER;
-
-  /*
-   * t is in -10 .. +10, so we need 32-bit math to
-   * evaluate t * MLKEM_Q and the subsequent subtraction
-   */
-  int16_t res = (int16_t)(a - t * MLKEM_Q);
-
-  debug_assert_abs_bound(&res, 1, HALF_Q);
-  return res;
-}
-
-#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/rej_uniform.c
deleted file mode 100644
index cbbe4407f..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/rej_uniform.c
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-
-#include "arith_backend.h"
-#include "debug.h"
-#include "fips202.h"
-#include "fips202x4.h"
-#include "rej_uniform.h"
-#include "symmetric.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
-#define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
-/* End of static namespacing */
-
-static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
-                                       unsigned int offset, const uint8_t *buf,
-                                       unsigned int buflen)
-__contract__(
-  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
-  requires(memory_no_alias(r, sizeof(int16_t) * target))
-  requires(memory_no_alias(buf, buflen))
-  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * target))
-  ensures(offset <= return_value && return_value <= target)
-  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
-)
-{
-  unsigned int ctr, pos;
-  uint16_t val0, val1;
-
-  debug_assert_bound(r, offset, 0, MLKEM_Q);
-
-  ctr = offset;
-  pos = 0;
-  /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
-  while (ctr < target && pos + 3 <= buflen)
-  __loop__(
-    invariant(offset <= ctr && ctr <= target && pos <= buflen)
-    invariant(ctr > 0 ==> array_bound(r, 0, ctr, 0, MLKEM_Q)))
-  {
-    val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
-    val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
-    pos += 3;
-
-    if (val0 < MLKEM_Q)
-    {
-      r[ctr++] = val0;
-    }
-    if (ctr < target && val1 < MLKEM_Q)
-    {
-      r[ctr++] = val1;
-    }
-  }
-
-  debug_assert_bound(r, ctr, 0, MLKEM_Q);
-  return ctr;
-}
-
-#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
-/*************************************************
- * Name:        rej_uniform
- *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
- *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
- *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
- **************************************************/
-
-/*
- * NOTE: The signature differs from the Kyber reference implementation
- * in that it adds the offset and always expects the base of the target
- * buffer. This avoids shifting the buffer base in the caller, which appears
- * tricky to reason about.
- */
-static unsigned int rej_uniform(int16_t *r, unsigned int target,
-                                unsigned int offset, const uint8_t *buf,
-                                unsigned int buflen)
-__contract__(
-  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
-  requires(memory_no_alias(r, sizeof(int16_t) * target))
-  requires(memory_no_alias(buf, buflen))
-  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * target))
-  ensures(offset <= return_value && return_value <= target)
-  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
-)
-{
-  return rej_uniform_scalar(r, target, offset, buf, buflen);
-}
-#else  /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-static unsigned int rej_uniform(int16_t *r, unsigned int target,
-                                unsigned int offset, const uint8_t *buf,
-                                unsigned int buflen)
-{
-  int ret;
-
-  /* Sample from large buffer with full lane as much as possible. */
-  ret = rej_uniform_native(r + offset, target - offset, buf, buflen);
-  if (ret != -1)
-  {
-    unsigned res = offset + (unsigned)ret;
-    debug_assert_bound(r, res, 0, MLKEM_Q);
-    return res;
-  }
-
-  return rej_uniform_scalar(r, target, offset, buf, buflen);
-}
-#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-
-#ifndef MLKEM_GEN_MATRIX_NBLOCKS
-#define MLKEM_GEN_MATRIX_NBLOCKS \
-  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
-#endif
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
-{
-  /* Temporary buffers for XOF output before rejection sampling */
-  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-
-  /* Tracks the number of coefficients we have already sampled */
-  unsigned int ctr[KECCAK_WAY];
-  xof_x4_ctx statex;
-  unsigned int buflen;
-
-  shake128x4_inc_init(&statex);
-
-  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
-  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
-                MLKEM_SYMBYTES + 2);
-
-  /*
-   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   * This should generate the matrix entries with high probability.
-   */
-  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
-                       &statex);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
-  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
-  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
-  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
-
-  /*
-   * So long as not all matrix entries have been generated, squeeze
-   * one more block a time until we're done.
-   */
-  buflen = XOF_RATE;
-  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
-         ctr[3] < MLKEM_N)
-  __loop__(
-    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
-       object_whole(buf1), object_whole(buf2), object_whole(buf3))
-    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
-    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
-    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
-    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
-    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
-    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
-  {
-    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
-    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
-    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
-    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
-    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
-  }
-
-  xof_x4_release(&statex);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
-{
-  xof_ctx state;
-  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  unsigned int ctr, buflen;
-
-  shake128_inc_init(&state);
-
-  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
-
-  /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   */
-  /* This should generate the matrix entry with high probability. */
-  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
-
-  /* Squeeze + sample one more block a time until we're done */
-  buflen = XOF_RATE;
-  while (ctr < MLKEM_N)
-  __loop__(
-    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
-    invariant(ctr <= MLKEM_N)
-    invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q)))
-  {
-    xof_squeezeblocks(buf, 1, &state);
-    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
-  }
-
-  xof_release(&state);
-}
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform)
-int empty_cu_rej_uniform;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/rej_uniform.h
deleted file mode 100644
index 801287259..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/rej_uniform.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef REJ_UNIFORM_H
-#define REJ_UNIFORM_H
-
-#include <stdint.h>
-#include <stdlib.h>
-#include "cbmc.h"
-#include "common.h"
-#include "poly.h"
-
-#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
-/*************************************************
- * Name:        poly_rej_uniform_x4
- *
- * Description: Generate four polynomials using rejection sampling
- *              on (pseudo-)uniformly random bytes sampled from a seed.
- *
- * Arguments:   - poly *vec:           Pointer to an array of 4 polynomials
- *                                     to be sampled.
- *              - uint8_t *seed[4]:    Pointer to array of four pointers
- *                                     pointing to the seed buffers of size
- *                                     MLKEM_SYMBYTES + 2 each.
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
-__contract__(
-  requires(memory_no_alias(vec, sizeof(poly) * 4))
-  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
-  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(vec, sizeof(poly) * 4))
-  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
-
-#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform)
-/*************************************************
- * Name:        poly_rej_uniform
- *
- * Description: Generate polynomial using rejection sampling
- *              on (pseudo-)uniformly random bytes sampled from a seed.
- *
- * Arguments:   - poly *vec:           Pointer to polynomial to be sampled.
- *              - uint8_t *seed:       Pointer to seed buffer of size
- *                                     MLKEM_SYMBYTES + 2 each.
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
-__contract__(
-  requires(memory_no_alias(entry, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(entry, sizeof(poly)))
-  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
-
-#endif /* REJ_UNIFORM_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/sampling.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/sampling.c
new file mode 100644
index 000000000..98cbdcb74
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/sampling.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
+#include "arith_backend.h"
+#include "debug.h"
+#include "fips202.h"
+#include "fips202x4.h"
+#include "sampling.h"
+#include "symmetric.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
+#define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
+#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
+#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
+/* End of static namespacing */
+
+static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
+                                       unsigned int offset, const uint8_t *buf,
+                                       unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
+)
+{
+  unsigned int ctr, pos;
+  uint16_t val0, val1;
+
+  debug_assert_bound(r, offset, 0, MLKEM_Q);
+
+  ctr = offset;
+  pos = 0;
+  /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
+  while (ctr < target && pos + 3 <= buflen)
+  __loop__(
+    invariant(offset <= ctr && ctr <= target && pos <= buflen)
+    invariant(ctr > 0 ==> array_bound(r, 0, ctr, 0, MLKEM_Q)))
+  {
+    val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
+    val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
+    pos += 3;
+
+    if (val0 < MLKEM_Q)
+    {
+      r[ctr++] = val0;
+    }
+    if (ctr < target && val1 < MLKEM_Q)
+    {
+      r[ctr++] = val1;
+    }
+  }
+
+  debug_assert_bound(r, ctr, 0, MLKEM_Q);
+  return ctr;
+}
+
+#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
+/*************************************************
+ * Name:        rej_uniform
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int target: requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *                                     Must be <= 4096.
+ *              - unsigned int offset: number of 16-bit integers that have
+ *                                     already been sampled.
+ *                                     Must be <= target.
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes
+ *                                     Must be <= 4096.
+ *                                     Must be a multiple of 3.
+ *
+ * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
+ * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
+ *
+ * Returns the new offset of sampled 16-bit integers, at most target,
+ * and at least the initial offset.
+ * If the new offset is strictly less than target, the entire input buffer
+ * is guaranteed to have been consumed. If it equals target, no information
+ * is provided on how many bytes of the input buffer have been consumed.
+ **************************************************/
+
+/*
+ * NOTE: The signature differs from the Kyber reference implementation
+ * in that it adds the offset and always expects the base of the target
+ * buffer. This avoids shifting the buffer base in the caller, which appears
+ * tricky to reason about.
+ */
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
+)
+{
+  return rej_uniform_scalar(r, target, offset, buf, buflen);
+}
+#else  /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
+{
+  int ret;
+
+  /* Sample from large buffer with full lane as much as possible. */
+  ret = rej_uniform_native(r + offset, target - offset, buf, buflen);
+  if (ret != -1)
+  {
+    unsigned res = offset + (unsigned)ret;
+    debug_assert_bound(r, res, 0, MLKEM_Q);
+    return res;
+  }
+
+  return rej_uniform_scalar(r, target, offset, buf, buflen);
+}
+#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+#ifndef MLKEM_GEN_MATRIX_NBLOCKS
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
+#endif
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+{
+  /* Temporary buffers for XOF output before rejection sampling */
+  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+
+  /* Tracks the number of coefficients we have already sampled */
+  unsigned int ctr[KECCAK_WAY];
+  xof_x4_ctx statex;
+  unsigned int buflen;
+
+  shake128x4_inc_init(&statex);
+
+  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
+  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
+                MLKEM_SYMBYTES + 2);
+
+  /*
+   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   * This should generate the matrix entries with high probability.
+   */
+  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
+                       &statex);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
+  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
+  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
+  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
+
+  /*
+   * So long as not all matrix entries have been generated, squeeze
+   * one more block at a time until we're done.
+   */
+  buflen = XOF_RATE;
+  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
+         ctr[3] < MLKEM_N)
+  __loop__(
+    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
+       object_whole(buf1), object_whole(buf2), object_whole(buf3))
+    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
+    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
+    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
+    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
+    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
+    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
+  {
+    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
+    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
+    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
+    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
+    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
+  }
+
+  xof_x4_release(&statex);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+{
+  xof_ctx state;
+  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  unsigned int ctr, buflen;
+
+  shake128_inc_init(&state);
+
+  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
+
+  /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   */
+  /* This should generate the matrix entry with high probability. */
+  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
+
+  /* Squeeze + sample one more block at a time until we're done */
+  buflen = XOF_RATE;
+  while (ctr < MLKEM_N)
+  __loop__(
+    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
+    invariant(ctr <= MLKEM_N)
+    invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q)))
+  {
+    xof_squeezeblocks(buf, 1, &state);
+    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
+  }
+
+  xof_release(&state);
+}
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
+#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
+/* End of static namespacing */
+
+/*************************************************
+ * Name:        load32_littleendian
+ *
+ * Description: load 4 bytes into a 32-bit integer
+ *              in little-endian order
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x
+ **************************************************/
+static uint32_t load32_littleendian(const uint8_t x[4])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  r |= (uint32_t)x[3] << 24;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
+  {
+    unsigned j;
+    uint32_t t = load32_littleendian(buf + 4 * i);
+    uint32_t d = t & 0x55555555;
+    d += (t >> 1) & 0x55555555;
+
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
+    {
+      const int16_t a = (d >> (4 * j + 0)) & 0x3;
+      const int16_t b = (d >> (4 * j + 2)) & 0x3;
+      r->coeffs[8 * i + j] = a - b;
+    }
+  }
+}
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+/*************************************************
+ * Name:        load24_littleendian
+ *
+ * Description: load 3 bytes into a 32-bit integer
+ *              in little-endian order.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
+ **************************************************/
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 4; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 4)
+    invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
+  {
+    unsigned j;
+    const uint32_t t = load24_littleendian(buf + 3 * i);
+    uint32_t d = t & 0x00249249;
+    d += (t >> 1) & 0x00249249;
+    d += (t >> 2) & 0x00249249;
+
+    for (j = 0; j < 4; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 4 && j <= 4)
+      invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
+    {
+      const int16_t a = (d >> (6 * j + 0)) & 0x7;
+      const int16_t b = (d >> (6 * j + 3)) & 0x7;
+      r->coeffs[4 * i + j] = a - b;
+    }
+  }
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
+          3 */
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_sampling MLKEM_NAMESPACE_K(empty_cu_sampling)
+int empty_cu_sampling;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/sampling.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/sampling.h
new file mode 100644
index 000000000..cc524e0fc
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/sampling.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef SAMPLING_H
+#define SAMPLING_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include "cbmc.h"
+#include "common.h"
+#include "poly.h"
+
+#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
+/*************************************************
+ * Name:        poly_cbd2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
+/*************************************************
+ * Name:        poly_cbd3
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=3.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED || MLKEM_ETA1 == 3 */
+
+#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
+/*************************************************
+ * Name:        poly_rej_uniform_x4
+ *
+ * Description: Generate four polynomials using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
+ *
+ * Arguments:   - poly *vec:           Pointer to an array of 4 polynomials
+ *                                     to be sampled.
+ *              - uint8_t *seed[4]:    Pointer to array of four pointers
+ *                                     pointing to the seed buffers of size
+ *                                     MLKEM_SYMBYTES + 2 each.
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+__contract__(
+  requires(memory_no_alias(vec, sizeof(poly) * 4))
+  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
+  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(vec, sizeof(poly) * 4))
+  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+
+#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform)
+/*************************************************
+ * Name:        poly_rej_uniform
+ *
+ * Description: Generate polynomial using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
+ *
+ * Arguments:   - poly *entry:         Pointer to polynomial to be sampled.
+ *              - uint8_t *seed:       Pointer to seed buffer of size
+ *                                     MLKEM_SYMBYTES + 2.
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+__contract__(
+  requires(memory_no_alias(entry, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(entry, sizeof(poly)))
+  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+
+#endif /* SAMPLING_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/zetas.c
index 4ef887c62..987f0dce4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/zetas.c
@@ -10,7 +10,7 @@
 
 #include "common.h"
 #if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-#include "ntt.h"
+#include "poly.h"
 
 /*
  * Table of zeta values used in the reference NTT and inverse NTT.
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/api.h
deleted file mode 100644
index 792ecb8a4..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/api.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * Native arithmetic interface
- *
- * This header is primarily for documentation purposes.
- * It should not be included by backend implementations.
- *
- * To ensure consistency with backends, the header will be
- * included automatically after inclusion of the active
- * backend, to ensure consistency of function signatures,
- * and run sanity checks.
- */
-#ifdef MLKEM_NATIVE_ARITH_NATIVE_API_H
-#error \
-    "The arithmetic backend API `mlkem/native/api.h` "		\
-    "should not be directly included. Please include the relevant "	\
-    "structure headers directly."
-#else /* MLKEM_NATIVE_ARITH_NATIVE_API_H */
-#define MLKEM_NATIVE_ARITH_NATIVE_API_H
-
-#include <stdint.h>
-#include "poly.h"
-#include "polyvec.h"
-
-/*
- * This is the C<->native interface allowing for the drop-in of
- * native code for performance critical arithmetic components of ML-KEM.
- *
- * A _backend_ is a specific implementation of (part of) this interface.
- *
- * To add a function to a backend, define MLKEM_USE_NATIVE_XXX and
- * implement `static inline xxx(...)` in the profile header.
- *
- * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can
- * be set if there are native implementations for all of NTT, invNTT, and
- * base multiplication, and allows the native implementation to use a
- * custom order of polynomial coefficients in NTT domain -- the use of such
- * custom order is not an implementation-detail since the public matrix
- * is generated in NTT domain. In this case, a permutation function
- * poly_permute_bitrev_to_custom() needs to be provided that permutes
- * polynomials in NTT domain from bitreversed to the custom order.
- */
-
-/*
- * Those functions are meant to be trivial wrappers around the chosen native
- * implementation. The are static inline to avoid unnecessary calls.
- * The macro before each declaration controls whether a native
- * implementation is present.
- */
-
-#if defined(MLKEM_USE_NATIVE_NTT)
-/*************************************************
- * Name:        ntt_native
- *
- * Description: Computes negacyclic number-theoretic transform (NTT) of
- *              a polynomial in place.
- *
- *              The input polynomial is assumed to be in normal order.
- *              The output polynomial is in bitreversed order, or of a
- *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
- *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
- *              for more information.
- *
- * Arguments:   - poly *p: pointer to in/output polynomial
- **************************************************/
-static INLINE void ntt_native(poly *);
-#endif /* MLKEM_USE_NATIVE_NTT */
-
-#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
-/*
- * This must only be set if NTT, invNTT, basemul, mulcache, and
- * to/from byte stream conversions all have native implementations
- * that are adapted to the custom order.
- */
-#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \
-    !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) ||                  \
-    !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) ||  \
-    !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) ||                           \
-    !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
-#error \
-    "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \
-set if there are native implementations for NTT, invNTT, mulcache, basemul, \
-and to/from bytes conversions."
-#endif
-
-/*************************************************
- * Name:        poly_permute_bitrev_to_custom
- *
- * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined,
- *              convert a polynomial in NTT domain from bitreversed
- *              order to the custom order output by the native NTT.
- *
- *              This must only be defined if there is native code for
- *              all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache.
- * Arguments:   - poly *p: pointer to in/output polynomial
- *
- **************************************************/
-static INLINE void poly_permute_bitrev_to_custom(poly *);
-#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
-
-#if defined(MLKEM_USE_NATIVE_INTT)
-/*************************************************
- * Name:        intt_native
- *
- * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
- *              of a polynomial in place.
- *
- *              The input polynomial is in bitreversed order, or of a
- *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
- *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
- *              for more information.
- *              The output polynomial is assumed to be in normal order.
- *
- * Arguments:   - uint16_t *a: pointer to in/output polynomial
- **************************************************/
-static INLINE void intt_native(poly *);
-#endif /* MLKEM_USE_NATIVE_INTT */
-
-#if defined(MLKEM_USE_NATIVE_POLY_REDUCE)
-/*************************************************
- * Name:        poly_reduce_native
- *
- * Description: Applies modular reduction to all coefficients of a polynomial.
- *
- * Arguments:   - poly *r: pointer to input/output polynomial
- **************************************************/
-static INLINE void poly_reduce_native(poly *);
-#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
-
-#if defined(MLKEM_USE_NATIVE_POLY_TOMONT)
-/*************************************************
- * Name:        poly_tomont_native
- *
- * Description: Inplace conversion of all coefficients of a polynomial
- *              from normal domain to Montgomery domain
- *
- * Arguments:   - poly *r: pointer to input/output polynomial
- **************************************************/
-static INLINE void poly_tomont_native(poly *);
-#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
-
-#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE)
-/*************************************************
- * Name:        poly_mulcache_compute_native
- *
- * Description: Compute multiplication cache for a polynomial
- *              in NTT domain.
- *
- *              The purpose of the multiplication cache is to
- *              cache repeated computations required during a
- *              base multiplication of polynomials in NTT domain.
- *              The structure of the multiplication-cache is
- *              implementation defined.
- *
- * Arguments:   INPUT:
- *              - poly: const pointer to input polynomial.
- *                  This must be in NTT domain and inin bitreversed order, or of
- *                  a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
- *                  See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
- *                  for more information.
- *              OUTPUT
- *              - cache: pointer to multiplication cache
- **************************************************/
-static INLINE void poly_mulcache_compute_native(poly_mulcache *cache,
-                                                const poly *poly);
-#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
-
-#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
-/*************************************************
- * Name:        poly_mulcache_compute_native
- *
- * Description: Compute multiplication of polynomials in NTT domain.
- *
- * Arguments:   INPUT:
- *              - a: First polynomial operand.
- *                 This must be in NTT domain and inin bitreversed order, or of
- *                 a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
- *                 See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
- *                 for more information.
- *              - b: Second polynomial operand.
- *                 As for a.
- *              - b_cache: Multiplication-cache for b.
- *              OUTPUT
- *              - r: Result of the base multiplication. This is again
- *                   in NTT domain, and of the same order as a and b.
- **************************************************/
-static INLINE void polyvec_basemul_acc_montgomery_cached_native(
-    poly *r, const polyvec *a, const polyvec *b,
-    const polyvec_mulcache *b_cache);
-#endif
-
-#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
-/*************************************************
- * Name:        poly_tobytes_native
- *
- * Description: Serialization of a polynomial.
- *              Signed coefficients are converted to
- *              unsigned form before serialization.
- *
- * Arguments:   INPUT:
- *              - a: const pointer to input polynomial,
- *                with each coefficient in the range -Q+1 .. Q-1
- *              OUTPUT
- *              - r: pointer to output byte array
- *                   (of MLKEM_POLYBYTES bytes)
- **************************************************/
-static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
-                                       const poly *a);
-#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
-
-#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
-/*************************************************
- * Name:        poly_frombytes_native
- *
- * Description: Serialization of a polynomial.
- *              Signed coefficients are converted to
- *              unsigned form before serialization.
- *
- * Arguments:   INPUT:
- *              - r: pointer to output polynomial in NTT domain
- *              OUTPUT
- *              - a: const pointer to input byte aray
- *                   (of MLKEM_POLYBYTES bytes)
- **************************************************/
-static INLINE void poly_frombytes_native(poly *a,
-                                         const uint8_t r[MLKEM_POLYBYTES]);
-#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-
-#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
-/*************************************************
- * Name:        rej_uniform_native
- *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
- *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int len:    requested number of 16-bit integers
- *                                     (uniform mod q).
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes.
- *
- * Return -1 if the native implementation does not support the input lengths.
- * Otherwise, returns non-negative number of sampled 16-bit integers (at most
- * len).
- **************************************************/
-static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
-                                     const uint8_t *buf, unsigned int buflen);
-#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-
-#endif /* MLKEM_NATIVE_ARITH_NATIVE_API_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/arith_backend.h
index 0543b1bd1..ade31cda1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/arith_backend.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/arith_backend.h
@@ -17,7 +17,7 @@
  * Keep this _after_ the inclusion of the backend; otherwise,
  * the sanity checks won't have an effect. */
 #if defined(MLKEM_NATIVE_CHECK_APIS)
-#include "api.h"
+#include "native/api.h"
 #endif
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbd.c
deleted file mode 100644
index 1e6b7c5d1..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbd.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
-
-#include <stdint.h>
-#include "cbd.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
-#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
-/* End of static namespacing */
-
-/*************************************************
- * Name:        load32_littleendian
- *
- * Description: load 4 bytes into a 32-bit integer
- *              in little-endian order
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x
- **************************************************/
-static uint32_t load32_littleendian(const uint8_t x[4])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  r |= (uint32_t)x[3] << 24;
-  return r;
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
-  {
-    unsigned j;
-    uint32_t t = load32_littleendian(buf + 4 * i);
-    uint32_t d = t & 0x55555555;
-    d += (t >> 1) & 0x55555555;
-
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
-    {
-      const int16_t a = (d >> (4 * j + 0)) & 0x3;
-      const int16_t b = (d >> (4 * j + 2)) & 0x3;
-      r->coeffs[8 * i + j] = a - b;
-    }
-  }
-}
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
-/*************************************************
- * Name:        load24_littleendian
- *
- * Description: load 3 bytes into a 32-bit integer
- *              in little-endian order.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
- **************************************************/
-static uint32_t load24_littleendian(const uint8_t x[3])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  return r;
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 4; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 4)
-    invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
-  {
-    unsigned j;
-    const uint32_t t = load24_littleendian(buf + 3 * i);
-    uint32_t d = t & 0x00249249;
-    d += (t >> 1) & 0x00249249;
-    d += (t >> 2) & 0x00249249;
-
-    for (j = 0; j < 4; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 4 && j <= 4)
-      invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
-    {
-      const int16_t a = (d >> (6 * j + 0)) & 0x7;
-      const int16_t b = (d >> (6 * j + 3)) & 0x7;
-      r->coeffs[4 * i + j] = a - b;
-    }
-  }
-}
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
-          3 */
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd)
-int empty_cu_cbd;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbd.h
deleted file mode 100644
index 54c1f5b90..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/cbd.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef CBD_H
-#define CBD_H
-
-#include <stdint.h>
-#include "common.h"
-#include "poly.h"
-
-#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
-/*************************************************
- * Name:        poly_cbd2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
-#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
-/*************************************************
- * Name:        poly_cbd3
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=3.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */
-
-#endif /* CBD_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/common.h
index 4f326333e..62ed53ab1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/common.h
@@ -15,12 +15,19 @@
 #include "sys.h"
 
 /* Include backend metadata */
-#if defined(MLKEM_USE_NATIVE)
-#if defined(MLKEM_NATIVE_ARITH_BACKEND)
-#include MLKEM_NATIVE_ARITH_BACKEND
+#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH)
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_FILE)
+#include MLKEM_NATIVE_ARITH_BACKEND_FILE
+#else
+#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_ARITH is set, but MLKEM_NATIVE_ARITH_BACKEND_FILE is not.
+#endif
 #endif
-#if defined(MLKEM_NATIVE_FIPS202_BACKEND)
-#include MLKEM_NATIVE_FIPS202_BACKEND
+
+#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202)
+#if defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE)
+#include MLKEM_NATIVE_FIPS202_BACKEND_FILE
+#else
+#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, but MLKEM_NATIVE_FIPS202_BACKEND_FILE is not.
 #endif
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/compress.c
new file mode 100644
index 000000000..a03fe0ac4
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/compress.c
@@ -0,0 +1,395 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
+#include <stdint.h>
+#include <string.h>
+#include "arith_backend.h"
+#include "cbmc.h"
+#include "compress.h"
+#include "debug.h"
+#include "verify.h"
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 16)))
+    {
+      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
+    }
+
+    r[i * 4] = t[0] | (t[1] << 4);
+    r[i * 4 + 1] = t[2] | (t[3] << 4);
+    r[i * 4 + 2] = t[4] | (t[5] << 4);
+    r[i * 4 + 3] = t[6] | (t[7] << 4);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(invariant(j <= MLKEM_N / 4))
+  {
+    unsigned k;
+    uint16_t t[4];
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k <= 4)
+      invariant(forall(r, 0, k, t[r] < (1u << 10))))
+    {
+      t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 10-bit in size.
+     */
+    r[5 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
+    r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
+    r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
+    r[5 * j + 4] = (t[3] >> 2);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
+  {
+    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
+    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 4)
+    invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[4];
+    uint8_t const *base = &a[5 * j];
+
+    t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6));
+    t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4));
+    t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2));
+
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k <= 4)
+      invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 32)))
+    {
+      t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
+    }
+
+    /*
+     * Explicitly truncate to avoid warning about
+     * implicit truncation in CBMC, and use array indexing into
+     * r rather than pointer-arithmetic to simplify verification
+     */
+    r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
+    r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
+    r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
+    r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
+    r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(invariant(j <= MLKEM_N / 8))
+  {
+    unsigned k;
+    uint16_t t[8];
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+    {
+      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 11-bit in size.
+     */
+    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
+    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
+    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
+    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
+    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
+    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
+    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
+    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
+    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
+    r[11 * j + 10] = (t[7] >> 3);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
+  {
+    unsigned j;
+    uint8_t t[8];
+    const unsigned offset = i * 5;
+    /*
+     * Explicitly truncate to avoid warning about
+     * implicit truncation in CBMC and unwind loop for ease
+     * of proof.
+     */
+
+    /*
+     * Decompress 5 8-bit bytes (so 40 bits) into
+     * 8 5-bit values stored in t[]
+     */
+    t[0] = 0x1F & (a[offset + 0] >> 0);
+    t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3));
+    t[2] = 0x1F & (a[offset + 1] >> 2);
+    t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1));
+    t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4));
+    t[5] = 0x1F & (a[offset + 3] >> 1);
+    t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2));
+    t[7] = 0x1F & (a[offset + 4] >> 3);
+
+    /* and copy to the correct slice in r[] */
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(j <= 8 && i <= MLKEM_N / 8)
+      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[8];
+    uint8_t const *base = &a[11 * j];
+    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
+    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
+                    ((uint16_t)base[4] << 10));
+    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
+    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
+    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
+                    ((uint16_t)base[8] << 9));
+    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
+    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
+
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(invariant(i <= MLKEM_N / 2))
+  {
+    const uint16_t t0 = a->coeffs[2 * i];
+    const uint16_t t1 = a->coeffs[2 * i + 1];
+    /*
+     * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
+     * significant data, so these can be packed into 24 bits or exactly
+     * 3 bytes, as follows.
+     */
+
+    /* Least significant bits 0 - 7 of t0. */
+    r[3 * i + 0] = t0 & 0xFF;
+
+    /*
+     * Most significant bits 8 - 11 of t0 become the least significant
+     * nibble of the second byte. The least significant 4 bits
+     * of t1 become the upper nibble of the second byte.
+     */
+    r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+
+    /* Bits 4 - 11 of t1 become the third byte. */
+    r[3 * i + 2] = t1 >> 4;
+  }
+}
+#else  /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  poly_tobytes_native(r, a->coeffs);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
+  {
+    const uint8_t t0 = a[3 * i + 0];
+    const uint8_t t1 = a[3 * i + 1];
+    const uint8_t t2 = a[3 * i + 2];
+    r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
+    r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
+  }
+
+  /* Note that the coefficients are not canonical */
+  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
+}
+#else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+  poly_frombytes_native(r->coeffs, a);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+{
+  unsigned i;
+#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
+#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
+#endif
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
+  {
+    unsigned j;
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <  MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
+    {
+      /* Prevent the compiler from recognizing this as a bit selection */
+      uint8_t mask = value_barrier_u8(1u << j);
+      r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
+    }
+  }
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    msg[i] = 0;
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8))
+    {
+      uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
+      msg[i] |= t << j;
+    }
+  }
+}
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_compress MLKEM_NAMESPACE_K(empty_cu_compress)
+int empty_cu_compress;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/compress.h
new file mode 100644
index 000000000..409dbe519
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/compress.h
@@ -0,0 +1,495 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef COMPRESS_H
+#define COMPRESS_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "cbmc.h"
+#include "common.h"
+#include "debug.h"
+#include "poly.h"
+#include "verify.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1)
+#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4)
+#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5)
+#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10)
+#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11)
+#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4)
+#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5)
+#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10)
+#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11)
+/* End of static namespacing */
+
+/************************************************************
+ * Name: scalar_compress_d1
+ *
+ * Description: Computes round(u * 2 / q) % 2
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 1.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d1(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 2)
+  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
+{
+  /* Fixed-point form of round(2u / q) mod 2: scale 2u by 645083, add
+   * 2^30 as the rounding offset and keep only the top bit. The
+   * arithmetic is modulo 2^32 by design. */
+  const uint32_t doubled = (uint32_t)u << 1;
+  return (uint32_t)(doubled * 645083u + (1u << 30)) >> 31;
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_compress_d4
+ *
+ * Description: Computes round(u * 16 / q) % 16
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d4(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 16)
+  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
+{
+  const uint32_t scaled = 1290160u * (uint32_t)u; /* 16 * round(2^28 / q) */
+  return (uint32_t)(scaled + (1u << 27)) >> 28;   /* round, modulo 2^32 */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d4
+ *
+ * Description: Computes round(u * q / 16)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 16
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d4(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 16)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return (uint16_t)((MLKEM_Q * u + 8) >> 4); /* round(u * q / 16) */ }
+
+/************************************************************
+ * Name: scalar_compress_d5
+ *
+ * Description: Computes round(u * 32 / q) % 32
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d5(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 32)
+  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
+{
+  const uint32_t scaled = 1290176u * (uint32_t)u; /* 32 * round(2^27 / q) */
+  return (uint32_t)(scaled + (1u << 26)) >> 27;   /* round, modulo 2^32 */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d5
+ *
+ * Description: Computes round(u * q / 32)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 32
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d5(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 32)
+  ensures(return_value <= MLKEM_Q - 1)
+) { return (uint16_t)((MLKEM_Q * u + 16) >> 5); /* round(u * q / 32) */ }
+
+/************************************************************
+ * Name: scalar_compress_d10
+ *
+ * Description: Computes round(u * 2**10 / q) % 2**10
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine exceeds UINT32_MAX for large values
+ * of u; it is carried out in 64-bit arithmetic and therefore cannot wrap.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d10(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 10))
+  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
+{
+  /* 2642263040 = 2^10 * round(2^33 / MLKEM_Q); all arithmetic is 64-bit */
+  const uint64_t scaled = 2642263040 * (uint64_t)u;
+  return (uint32_t)(((scaled + ((uint64_t)1 << 32)) >> 33) & 0x3FF);
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d10
+ *
+ * Description: Computes round(u * q / 1024)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 1024
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d10(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 1024)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return (uint16_t)((MLKEM_Q * u + 512) >> 10); /* round(u * q / 1024) */ }
+
+/************************************************************
+ * Name: scalar_compress_d11
+ *
+ * Description: Computes round(u * 2**11 / q) % 2**11
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine exceeds UINT32_MAX for large values
+ * of u; it is carried out in 64-bit arithmetic and therefore cannot wrap.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d11(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 11))
+  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
+{
+  /* 5284526080 = 2^11 * round(2^33 / MLKEM_Q); all arithmetic is 64-bit */
+  const uint64_t scaled = 5284526080 * (uint64_t)u;
+  return (uint32_t)(((scaled + ((uint64_t)1 << 32)) >> 33) & 0x7FF);
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d11
+ *
+ * Description: Computes round(u * q / 2048)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 2048
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d11(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 2048)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return (uint16_t)((MLKEM_Q * u + 1024) >> 11); /* round(u * q / 2048) */ }
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
+    (MLKEM_K == 2 || MLKEM_K == 3)
+#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
+/*************************************************
+ * Name:        poly_compress_d4
+ *
+ * Description: Compression (4 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
+
+#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
+/*************************************************
+ * Name:        poly_compress_d10
+ *
+ * Description: Compression (10 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
+
+#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
+/*************************************************
+ * Name:        poly_decompress_d4
+ *
+ * Description: De-serialization and subsequent decompression (4 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d4
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
+
+#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
+/*************************************************
+ * Name:        poly_decompress_d10
+ *
+ * Description: De-serialization and subsequent decompression (10 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d10
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
+/*************************************************
+ * Name:        poly_compress_d5
+ *
+ * Description: Compression (5 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
+
+#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
+/*************************************************
+ * Name:        poly_compress_d11
+ *
+ * Description: Compression (11 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
+
+#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
+/*************************************************
+ * Name:        poly_decompress_d5
+ *
+ * Description: De-serialization and subsequent decompression (5 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d5
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+
+#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
+/*************************************************
+ * Name:        poly_decompress_d11
+ *
+ * Description: De-serialization and subsequent decompression (11 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d11
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
+        */
+
+#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
+/*************************************************
+ * Name:        poly_tobytes
+ *
+ * Description: Serialization of a polynomial.
+ *              Signed coefficients are converted to
+ *              unsigned form before serialization.
+ *
+ * Arguments:   INPUT:
+ *              - a: const pointer to input polynomial,
+ *                with each coefficient in the range [0,1,..,Q-1]
+ *              OUTPUT
+ *              - r: pointer to output byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYBYTES))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r))
+);
+
+
+#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes)
+/*************************************************
+ * Name:        poly_frombytes
+ *
+ * Description: De-serialization of a polynomial.
+ *
+ * Arguments:   INPUT
+ *              - a: pointer to input byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ *              OUTPUT
+ *              - r: pointer to output polynomial, with
+ *                   each coefficient unsigned and in the range
+ *                   0 .. 4095
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))
+);
+
+
+#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg)
+/*************************************************
+ * Name:        poly_frommsg
+ *
+ * Description: Convert 32-byte message to polynomial
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *msg: pointer to input message
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+);
+
+#define poly_tomsg MLKEM_NAMESPACE(poly_tomsg)
+/*************************************************
+ * Name:        poly_tomsg
+ *
+ * Description: Convert polynomial to 32-byte message
+ *
+ * Arguments:   - uint8_t *msg: pointer to output message
+ *              - const poly *r: pointer to input polynomial
+ *                Coefficients must be unsigned canonical
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r)
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(msg))
+);
+
+#endif /* COMPRESS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/config.h
index fa89370ce..e975ede95 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/config.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/config.h
@@ -122,46 +122,87 @@
 /* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 /******************************************************************************
- * Name:        MLKEM_USE_NATIVE
+ * Name:        MLKEM_USE_NATIVE_BACKEND_ARITH
  *
- * Description: Determines whether a native backend should
- *              be used, if available.
+ * Description: Determines whether a native arithmetic backend should be used.
+ *
+ *              The arithmetic backend covers performance-critical functions
+ *              such as the number-theoretic transform (NTT).
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the arithmetic backend to be used is
+ *              determined by MLKEM_NATIVE_ARITH_BACKEND_FILE: If the latter
+ *              is unset, the default backend for the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
  *
  *              This can also be set using CFLAGS.
  *
  *****************************************************************************/
-#if !defined(MLKEM_USE_NATIVE)
-/* #define MLKEM_USE_NATIVE */
+#if !defined(MLKEM_USE_NATIVE_BACKEND_ARITH)
+/* #define MLKEM_USE_NATIVE_BACKEND_ARITH */
 #endif
 
 /******************************************************************************
- * Name:        MLKEM_NATIVE_ARITH_BACKEND
+ * Name:        MLKEM_NATIVE_ARITH_BACKEND_FILE
  *
  * Description: The arithmetic backend to use.
  *
- *              This must be the filename of an arithmetic backend.
- *              See the existing backends for examples.
+ *              If MLKEM_USE_NATIVE_BACKEND_ARITH is unset, this option
+ *              is ignored.
+ *
+ *              If MLKEM_USE_NATIVE_BACKEND_ARITH is set, this option must
+ *              either be undefined or the filename of an arithmetic backend.
+ *              If unset, the default backend will be used.
  *
  *              This can be set using CFLAGS.
  *
  *****************************************************************************/
-#if defined(MLKEM_USE_NATIVE) && !defined(MLKEM_NATIVE_ARITH_BACKEND)
-#define MLKEM_NATIVE_ARITH_BACKEND "default.h"
-#endif /* MLKEM_NATIVE_ARITH_BACKEND */
+#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) && \
+    !defined(MLKEM_NATIVE_ARITH_BACKEND_FILE)
+#define MLKEM_NATIVE_ARITH_BACKEND_FILE "native/default.h"
+#endif
 
 /******************************************************************************
- * Name:        MLKEM_NATIVE_FIPS202_BACKEND
+ * Name:        MLKEM_USE_NATIVE_BACKEND_FIPS202
+ *
+ * Description: Determines whether a native FIPS202 backend should be used.
+ *
+ *              The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is
+ *              the performance bottleneck of SHA3 and SHAKE.
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the FIPS202 backend to be used is
+ *              determined by MLKEM_NATIVE_FIPS202_BACKEND_FILE: If the latter
+ *              is unset, the default backend for the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if !defined(MLKEM_USE_NATIVE_BACKEND_FIPS202)
+/* #define MLKEM_USE_NATIVE_BACKEND_FIPS202 */
+#endif
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_FIPS202_BACKEND_FILE
  *
  * Description: The FIPS-202 backend to use.
  *
- *              This must be the filename of an FIPS-202 backend.
+ *              If MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, this option must
+ *              either be undefined or the filename of a FIPS202 backend.
+ *              If unset, the default backend will be used.
  *
  *              This can be set using CFLAGS.
  *
  *****************************************************************************/
-#if defined(MLKEM_USE_NATIVE_FIPS202) && !defined(MLKEM_NATIVE_FIPS202_BACKEND)
-#define MLKEM_NATIVE_FIPS202_BACKEND "native/default.h"
-#endif /* MLKEM_NATIVE_FIPS202_BACKEND */
+#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) && \
+    !defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE)
+#define MLKEM_NATIVE_FIPS202_BACKEND_FILE "fips202/native/default.h"
+#endif
 
 /*************************  Config internals  ********************************/
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/default.h
deleted file mode 100644
index d1e41c52e..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/default.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H
-#define MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H
-
-/*
- * Default arithmetic backend
- */
-#include "sys.h"
-
-#ifdef SYS_AARCH64
-/*
- * For AArch64, we currently we have one clean and one opt profile.
- * We default to the opt profile.
- *
- * In the future, this may branch further depending on the microarchitecture.
- */
-#include "aarch64/opt.h"
-#endif /* SYS_AARCH64 */
-
-#ifdef SYS_X86_64_AVX2
-/*
- * For now, there's only one x86_64 profile, based on
- * the AVX2 code from the Kyber repository.
- * https://github.com/pq-crystals/kyber
- */
-#include "x86_64/default.h"
-#endif /* SYS_X86_64 */
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.c
index 0cfcc3e9e..318d0fc77 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.c
@@ -9,11 +9,10 @@
 #include "fips202.h"
 #include "fips202x4.h"
 #include "indcpa.h"
-#include "ntt.h"
 #include "poly.h"
-#include "polyvec.h"
+#include "poly_k.h"
 #include "randombytes.h"
-#include "rej_uniform.h"
+#include "sampling.h"
 #include "symmetric.h"
 
 #include "arith_backend.h"
@@ -149,14 +148,14 @@ static void unpack_ciphertext(polyvec *b, poly *v,
 #define poly_permute_bitrev_to_custom \
   MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom)
 
-static INLINE void poly_permute_bitrev_to_custom(poly *data)
+static INLINE void poly_permute_bitrev_to_custom(int16_t data[MLKEM_N])
 __contract__(
   /* We don't specify that this should be a permutation, but only
    * that it does not change the bound established at the end of gen_matrix. */
-  requires(memory_no_alias(data, sizeof(poly)))
-  requires(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N))
+  requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))
   assigns(memory_slice(data, sizeof(poly)))
-  ensures(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
+  ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
 #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
 
 /* Not static for benchmarking */
@@ -245,7 +244,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
   {
     for (j = 0; j < MLKEM_K; j++)
     {
-      poly_permute_bitrev_to_custom(&a[i].vec[j]);
+      poly_permute_bitrev_to_custom(a[i].vec[j].coeffs);
     }
   }
 }
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.h
index 2c4fda3c4..b4d5985bf 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/indcpa.h
@@ -8,7 +8,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "polyvec.h"
+#include "poly_k.h"
 
 #define gen_matrix MLKEM_NAMESPACE_K(gen_matrix)
 /*************************************************
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/README.md b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/README.md
similarity index 100%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/README.md
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/README.md
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/clean.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/clean.h
similarity index 90%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/clean.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/clean.h
index 43a401dfc..f124702a4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/clean.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/clean.h
@@ -19,6 +19,6 @@
 /* Filename of the C backend implementation.
  * This is not inlined here because this header is included in assembly
  * files as well. */
-#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "aarch64/src/clean_impl.h"
+#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/aarch64/src/clean_impl.h"
 
 #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/opt.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/opt.h
similarity index 91%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/opt.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/opt.h
index 04323c3e7..a7217163f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/opt.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/opt.h
@@ -19,6 +19,6 @@
 /* Filename of the C backend implementation.
  * This is not inlined here because this header is included in assembly
  * files as well. */
-#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "aarch64/src/opt_impl.h"
+#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/aarch64/src/opt_impl.h"
 
 #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/aarch64_zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/aarch64_zetas.c
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/aarch64_zetas.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/aarch64_zetas.c
index 1e189fd99..b3a6f198f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/aarch64_zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/aarch64_zetas.c
@@ -8,7 +8,7 @@
  *          Do not modify it directly.
  */
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \
     defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/arith_native_aarch64.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/arith_native_aarch64.h
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/arith_native_aarch64.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/arith_native_aarch64.h
index fc4e7dd38..a784a3027 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/arith_native_aarch64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/arith_native_aarch64.h
@@ -6,7 +6,7 @@
 #define MLKEM_AARCH64_NATIVE_H
 
 #include <stdint.h>
-#include "common.h"
+#include "../../../common.h"
 
 #define aarch64_ntt_zetas_layer01234 \
   MLKEM_NAMESPACE(aarch64_ntt_zetas_layer01234)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/clean_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/clean_impl.h
similarity index 58%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/clean_impl.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/clean_impl.h
index 548b1eebb..ded7d067a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/clean_impl.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/clean_impl.h
@@ -12,9 +12,6 @@
 
 #include "arith_native_aarch64.h"
 
-#include "poly.h"
-#include "polyvec.h"
-
 /* Set of primitives that this backend replaces */
 #define MLKEM_USE_NATIVE_NTT
 #define MLKEM_USE_NATIVE_INTT
@@ -25,45 +22,46 @@
 #define MLKEM_USE_NATIVE_POLY_TOBYTES
 #define MLKEM_USE_NATIVE_REJ_UNIFORM
 
-static INLINE void ntt_native(poly *data)
+static INLINE void ntt_native(int16_t data[MLKEM_N])
 {
-  ntt_asm_clean(data->coeffs, aarch64_ntt_zetas_layer01234,
-                aarch64_ntt_zetas_layer56);
+  ntt_asm_clean(data, aarch64_ntt_zetas_layer01234, aarch64_ntt_zetas_layer56);
 }
 
-static INLINE void intt_native(poly *data)
+static INLINE void intt_native(int16_t data[MLKEM_N])
 {
-  intt_asm_clean(data->coeffs, aarch64_invntt_zetas_layer01234,
+  intt_asm_clean(data, aarch64_invntt_zetas_layer01234,
                  aarch64_invntt_zetas_layer56);
 }
 
-static INLINE void poly_reduce_native(poly *data)
+static INLINE void poly_reduce_native(int16_t data[MLKEM_N])
 {
-  poly_reduce_asm_clean(data->coeffs);
+  poly_reduce_asm_clean(data);
 }
-static INLINE void poly_tomont_native(poly *data)
+
+static INLINE void poly_tomont_native(int16_t data[MLKEM_N])
 {
-  poly_tomont_asm_clean(data->coeffs);
+  poly_tomont_asm_clean(data);
 }
 
-static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const poly *y)
+static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2],
+                                                const int16_t y[MLKEM_N])
 {
-  poly_mulcache_compute_asm_clean(x->coeffs, y->coeffs,
-                                  aarch64_zetas_mulcache_native,
+  poly_mulcache_compute_asm_clean(x, y, aarch64_zetas_mulcache_native,
                                   aarch64_zetas_mulcache_twisted_native);
 }
+
 static INLINE void polyvec_basemul_acc_montgomery_cached_native(
-    poly *r, const polyvec *a, const polyvec *b,
-    const polyvec_mulcache *b_cache)
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)])
 {
-  polyvec_basemul_acc_montgomery_cached_asm_clean(
-      r->coeffs, a->vec[0].coeffs, b->vec[0].coeffs, b_cache->vec[0].coeffs);
+  polyvec_basemul_acc_montgomery_cached_asm_clean(r, a, b, b_cache);
 }
 
 static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
-                                       const poly *a)
+                                       const int16_t a[MLKEM_N])
 {
-  poly_tobytes_asm_clean(r, a->coeffs);
+  poly_tobytes_asm_clean(r, a);
 }
 
 static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/consts.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/consts.h
similarity index 94%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/consts.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/consts.h
index c40947299..e3ea26a27 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/consts.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/consts.h
@@ -7,7 +7,7 @@
 #define MLKEM_NATIVE_AARCH64_CONSTS
 
 #include <stdint.h>
-#include "common.h"
+#include "../../../common.h"
 
 #define zetas_mulcache_native MLKEM_NAMESPACE(zetas_mulcache_native)
 extern const int16_t zetas_mulcache_native[256];
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_clean.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_clean.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_clean.S
index b243a569d..28ad38975 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_clean.S
@@ -23,7 +23,7 @@
 /// SOFTWARE.
 ///
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
 
 // Bounds:
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_opt.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_opt.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_opt.S
index c94746e17..857c729cb 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_opt.S
@@ -23,7 +23,7 @@
 /// SOFTWARE.
 ///
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
 // Bounds:
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_clean.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_clean.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_clean.S
index cd63cc4d6..30fdc76b0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_clean.S
@@ -24,7 +24,7 @@
 /// SOFTWARE.
 ///
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
 
 // Bounds:
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_opt.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_opt.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_opt.S
index 8705615b7..431f9dc6f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_opt.S
@@ -24,7 +24,7 @@
 /// SOFTWARE.
 ///
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
 // Bounds:
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/opt_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/opt_impl.h
similarity index 58%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/opt_impl.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/opt_impl.h
index ec1bf6587..eb8e39ed0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/opt_impl.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/opt_impl.h
@@ -10,11 +10,9 @@
 #else
 #define MLKEM_NATIVE_ARITH_PROFILE_IMPL_H
 
+#include "../../../params.h"
 #include "arith_native_aarch64.h"
 
-#include "poly.h"
-#include "polyvec.h"
-
 /* Set of primitives that this backend replaces */
 #define MLKEM_USE_NATIVE_NTT
 #define MLKEM_USE_NATIVE_INTT
@@ -25,45 +23,46 @@
 #define MLKEM_USE_NATIVE_POLY_TOBYTES
 #define MLKEM_USE_NATIVE_REJ_UNIFORM
 
-static INLINE void ntt_native(poly *data)
+static INLINE void ntt_native(int16_t data[MLKEM_N])
 {
-  ntt_asm_opt(data->coeffs, aarch64_ntt_zetas_layer01234,
-              aarch64_ntt_zetas_layer56);
+  ntt_asm_opt(data, aarch64_ntt_zetas_layer01234, aarch64_ntt_zetas_layer56);
 }
 
-static INLINE void intt_native(poly *data)
+static INLINE void intt_native(int16_t data[MLKEM_N])
 {
-  intt_asm_opt(data->coeffs, aarch64_invntt_zetas_layer01234,
+  intt_asm_opt(data, aarch64_invntt_zetas_layer01234,
                aarch64_invntt_zetas_layer56);
 }
 
-static INLINE void poly_reduce_native(poly *data)
+static INLINE void poly_reduce_native(int16_t data[MLKEM_N])
 {
-  poly_reduce_asm_opt(data->coeffs);
+  poly_reduce_asm_opt(data);
 }
-static INLINE void poly_tomont_native(poly *data)
+
+static INLINE void poly_tomont_native(int16_t data[MLKEM_N])
 {
-  poly_tomont_asm_opt(data->coeffs);
+  poly_tomont_asm_opt(data);
 }
 
-static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const poly *y)
+static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2],
+                                                const int16_t y[MLKEM_N])
 {
-  poly_mulcache_compute_asm_opt(x->coeffs, y->coeffs,
-                                aarch64_zetas_mulcache_native,
+  poly_mulcache_compute_asm_opt(x, y, aarch64_zetas_mulcache_native,
                                 aarch64_zetas_mulcache_twisted_native);
 }
+
 static INLINE void polyvec_basemul_acc_montgomery_cached_native(
-    poly *r, const polyvec *a, const polyvec *b,
-    const polyvec_mulcache *b_cache)
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)])
 {
-  polyvec_basemul_acc_montgomery_cached_asm_opt(
-      r->coeffs, a->vec[0].coeffs, b->vec[0].coeffs, b_cache->vec[0].coeffs);
+  polyvec_basemul_acc_montgomery_cached_asm_opt(r, a, b, b_cache);
 }
 
 static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
-                                       const poly *a)
+                                       const int16_t a[MLKEM_N])
 {
-  poly_tobytes_asm_opt(r, a->coeffs);
+  poly_tobytes_asm_opt(r, a);
 }
 
 static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/optimize.sh b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/optimize.sh
similarity index 100%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/optimize.sh
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/optimize.sh
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_clean.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_clean.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_clean.S
index 809f9667e..f3ee0796f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_clean.S
@@ -3,7 +3,7 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
 
 /*
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_opt.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_opt.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_opt.S
index 815a9dd1a..555c60a67 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_opt.S
@@ -3,7 +3,7 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
 /*
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_clean.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_clean.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_clean.S
index c91675b44..0b6df6345 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_clean.S
@@ -9,7 +9,7 @@
 // https://eprint.iacr.org/2021/986
 // https://github.com/neon-ntt/neon-ntt
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN)
 
 // Input:
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_opt.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_opt.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_opt.S
index 8300b682c..7a27fda3e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_opt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_opt.S
@@ -9,7 +9,7 @@
 // https://eprint.iacr.org/2021/986
 // https://github.com/neon-ntt/neon-ntt
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
 // Input:
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_asm_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_asm_clean.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_asm_clean.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_asm_clean.S
index 5151a05d0..9158d6c82 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_asm_clean.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_asm_clean.S
@@ -18,7 +18,7 @@
  *
  * Returns number of sampled 16-bit integers (at most MLKEM_N).
  **************************************************/
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \
     defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_table.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_table.c
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_table.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_table.c
index 507660349..29cdbe95f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_table.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_table.c
@@ -8,7 +8,7 @@
  *          Do not modify it directly.
  */
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \
     defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/api.h
new file mode 100644
index 000000000..0704f9dcd
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/api.h
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * Native arithmetic interface
+ *
+ * This header is primarily for documentation purposes.
+ * It should not be included by backend implementations.
+ *
+ * To ensure consistency with backends, the header will be
+ * included automatically after inclusion of the active
+ * backend, to ensure consistency of function signatures,
+ * and run sanity checks.
+ */
+#ifdef MLKEM_NATIVE_ARITH_NATIVE_API_H
+#error \
+    "The arithmetic backend API `mlkem/native/api.h` "		\
+    "should not be directly included. Please include the relevant "	\
+    "structure headers directly."
+#else /* MLKEM_NATIVE_ARITH_NATIVE_API_H */
+#define MLKEM_NATIVE_ARITH_NATIVE_API_H
+
+#include <stdint.h>
+#include "../common.h"
+
+/*
+ * This is the C<->native interface allowing for the drop-in of
+ * native code for performance critical arithmetic components of ML-KEM.
+ *
+ * A _backend_ is a specific implementation of (part of) this interface.
+ *
+ * To add a function to a backend, define MLKEM_USE_NATIVE_XXX and
+ * implement `static inline xxx(...)` in the profile header.
+ *
+ * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can
+ * be set if there are native implementations for all of NTT, invNTT, and
+ * base multiplication, and allows the native implementation to use a
+ * custom order of polynomial coefficients in NTT domain -- the use of such
+ * custom order is not an implementation-detail since the public matrix
+ * is generated in NTT domain. In this case, a permutation function
+ * poly_permute_bitrev_to_custom() needs to be provided that permutes
+ * polynomials in NTT domain from bitreversed to the custom order.
+ */
+
+/*
+ * Those functions are meant to be trivial wrappers around the chosen native
+ * implementation. They are static inline to avoid unnecessary calls.
+ * The macro before each declaration controls whether a native
+ * implementation is present.
+ */
+
+#if defined(MLKEM_USE_NATIVE_NTT)
+/*************************************************
+ * Name:        ntt_native
+ *
+ * Description: Computes negacyclic number-theoretic transform (NTT) of
+ *              a polynomial in place.
+ *
+ *              The input polynomial is assumed to be in normal order.
+ *              The output polynomial is in bitreversed order, or of a
+ *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *              for more information.
+ *
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
+ **************************************************/
+static INLINE void ntt_native(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_NTT */
+
+#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
+/*
+ * This must only be set if NTT, invNTT, basemul, mulcache, and
+ * to/from byte stream conversions all have native implementations
+ * that are adapted to the custom order.
+ */
+#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \
+    !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) ||                  \
+    !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) ||  \
+    !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) ||                           \
+    !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+#error \
+    "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \
+set if there are native implementations for NTT, invNTT, mulcache, basemul, \
+and to/from bytes conversions."
+#endif
+
+/*************************************************
+ * Name:        poly_permute_bitrev_to_custom
+ *
+ * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined,
+ *              convert a polynomial in NTT domain from bitreversed
+ *              order to the custom order output by the native NTT.
+ *
+ *              This must only be defined if there is native code for
+ *              all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache.
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
+ *
+ **************************************************/
+static INLINE void poly_permute_bitrev_to_custom(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
+
+#if defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        intt_native
+ *
+ * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+ *              of a polynomial in place.
+ *
+ *              The input polynomial is in bitreversed order, or of a
+ *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *              for more information.
+ *              The output polynomial is assumed to be in normal order.
+ *
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
+ **************************************************/
+static INLINE void intt_native(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_INTT */
+
+#if defined(MLKEM_USE_NATIVE_POLY_REDUCE)
+/*************************************************
+ * Name:        poly_reduce_native
+ *
+ * Description: Applies modular reduction to all coefficients of a polynomial.
+ *
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to input/output polynomial
+ **************************************************/
+static INLINE void poly_reduce_native(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
+
+#if defined(MLKEM_USE_NATIVE_POLY_TOMONT)
+/*************************************************
+ * Name:        poly_tomont_native
+ *
+ * Description: Inplace conversion of all coefficients of a polynomial
+ *              from normal domain to Montgomery domain
+ *
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to input/output polynomial
+ **************************************************/
+static INLINE void poly_tomont_native(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
+
+#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE)
+/*************************************************
+ * Name:        poly_mulcache_compute_native
+ *
+ * Description: Compute multiplication cache for a polynomial
+ *              in NTT domain.
+ *
+ *              The purpose of the multiplication cache is to
+ *              cache repeated computations required during a
+ *              base multiplication of polynomials in NTT domain.
+ *              The structure of the multiplication-cache is
+ *              implementation defined.
+ *
+ * Arguments:   INPUT:
+ *              - poly: const pointer to input polynomial.
+ *                  This must be in NTT domain and in bitreversed order, or of
+ *                  a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *                  See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *                  for more information.
+ *              OUTPUT
+ *              - cache: pointer to multiplication cache
+ **************************************************/
+static INLINE void poly_mulcache_compute_native(int16_t cache[MLKEM_N / 2],
+                                                const int16_t poly[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
+#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery_cached_native
+ *
+ * Description: Compute multiplication of polynomials in NTT domain.
+ *
+ * Arguments:   INPUT:
+ *              - a: First polynomial operand.
+ *                 This must be in NTT domain and in bitreversed order, or of
+ *                 a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *                 See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *                 for more information.
+ *              - b: Second polynomial operand.
+ *                 As for a.
+ *              - b_cache: Multiplication-cache for b.
+ *              OUTPUT
+ *              - r: Result of the base multiplication. This is again
+ *                   in NTT domain, and of the same order as a and b.
+ **************************************************/
+static INLINE void polyvec_basemul_acc_montgomery_cached_native(
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]);
+#endif
+
+#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
+/*************************************************
+ * Name:        poly_tobytes_native
+ *
+ * Description: Serialization of a polynomial.
+ *              Signed coefficients are converted to
+ *              unsigned form before serialization.
+ *
+ * Arguments:   INPUT:
+ *              - a: const pointer to input polynomial,
+ *                with each coefficient in the range -Q+1 .. Q-1
+ *              OUTPUT
+ *              - r: pointer to output byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ **************************************************/
+static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
+                                       const int16_t a[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+
+#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+/*************************************************
+ * Name:        poly_frombytes_native
+ *
+ * Description: De-serialization of a polynomial.
+ *              Converts a byte array into a polynomial
+ *              (coefficient array).
+ *
+ * Arguments:   INPUT:
+ *              - r: const pointer to input byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ *              OUTPUT
+ *              - a: pointer to output polynomial in NTT domain
+ **************************************************/
+static INLINE void poly_frombytes_native(int16_t a[MLKEM_N],
+                                         const uint8_t r[MLKEM_POLYBYTES]);
+#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+
+#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
+/*************************************************
+ * Name:        rej_uniform_native
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int len:    requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes.
+ *
+ * Return -1 if the native implementation does not support the input lengths.
+ * Otherwise, returns non-negative number of sampled 16-bit integers (at most
+ * len).
+ **************************************************/
+static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
+                                     const uint8_t *buf, unsigned int buflen);
+#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+#endif /* MLKEM_NATIVE_ARITH_NATIVE_API_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/default.h
new file mode 100644
index 000000000..f9fe4310a
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/default.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H
+#define MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H
+
+/*
+ * Default arithmetic backend
+ */
+#include "../sys.h"
+
+#ifdef SYS_AARCH64
+/*
+ * For AArch64, we currently have one clean and one opt profile.
+ * We default to the opt profile.
+ *
+ * In the future, this may branch further depending on the microarchitecture.
+ */
+#include "aarch64/opt.h"
+#endif /* SYS_AARCH64 */
+
+#ifdef SYS_X86_64_AVX2
+/*
+ * For now, there's only one x86_64 profile, based on
+ * the AVX2 code from the Kyber repository.
+ * https://github.com/pq-crystals/kyber
+ */
+#include "x86_64/default.h"
+#endif /* SYS_X86_64_AVX2 */
+
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/ntt.c
deleted file mode 100644
index 3651c8da9..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/ntt.c
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-
-#include <stdint.h>
-#include "arith_backend.h"
-#include "debug.h"
-#include "ntt.h"
-#include "reduce.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block)
-#define ntt_layer MLKEM_NAMESPACE(ntt_layer)
-#define invntt_layer MLKEM_NAMESPACE(invntt_layer)
-/* End of static namespacing */
-
-#if !defined(MLKEM_USE_NATIVE_NTT)
-/*
- * Computes a block CT butterflies with a fixed twiddle factor,
- * using Montgomery multiplication.
- * Parameters:
- * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
- * - root: Twiddle factor to use for the butterfly. This must be in
- *         Montgomery form and signed canonical.
- * - start: Offset to the beginning of the butterfly block
- * - len: Index difference between coefficients subject to a butterfly
- * - bound: Ghost variable describing coefficient bound: Prior to `start`,
- *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
- *          they must be bound by `bound`.
- * When this function returns, output coefficients in the index range
- * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
- * Example:
- * - start=8, len=4
- *   This would compute the following four butterflies
- *          8     --    12
- *             9    --     13
- *                10   --     14
- *                   11   --     15
- * - start=4, len=2
- *   This would compute the following two butterflies
- *          4 -- 6
- *             5 -- 7
- */
-static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
-                                unsigned start, unsigned len, int bound)
-__contract__(
-  requires(start < MLKEM_N)
-  requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
-  requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
-  requires(-HALF_Q < zeta && zeta < HALF_Q)
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(array_abs_bound(r, 0, start, bound + MLKEM_Q))
-  requires(array_abs_bound(r, start, MLKEM_N, bound))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q))
-  ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
-{
-  /* `bound` is a ghost variable only needed in the CBMC specification */
-  unsigned j;
-  ((void)bound);
-  for (j = start; j < start + len; j++)
-  __loop__(
-    invariant(start <= j && j <= start + len)
-    /*
-     * Coefficients are updated in strided pairs, so the bounds for the
-     * intermediate states alternate twice between the old and new bound
-     */
-    invariant(array_abs_bound(r, 0,           j,           bound + MLKEM_Q))
-    invariant(array_abs_bound(r, j,           start + len, bound))
-    invariant(array_abs_bound(r, start + len, j + len,     bound + MLKEM_Q))
-    invariant(array_abs_bound(r, j + len,     MLKEM_N,     bound)))
-  {
-    int16_t t;
-    t = fqmul(r[j + len], zeta);
-    r[j + len] = r[j] - t;
-    r[j] = r[j] + t;
-  }
-}
-
-/*
- *Compute one layer of forward NTT
- * Parameters:
- * - r: Pointer to base of polynomial
- * - len: Stride of butterflies in this layer.
- * - layer: Ghost variable indicating which layer is being applied.
- *          Must match `len` via `len == MLKEM_N >> layer`.
- * Note: `len` could be dropped and computed in the function, but
- *   we are following the structure of the reference NTT from the
- *   official Kyber implementation here, merely adding `layer` as
- *   a ghost variable for the specifications.
- */
-static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
-__contract__(
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
-  requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
-{
-  unsigned start, k;
-  /* `layer` is a ghost variable only needed in the CBMC specification */
-  ((void)layer);
-  /* Twiddle factors for layer n start at index 2^(layer-1) */
-  k = MLKEM_N / (2 * len);
-  for (start = 0; start < MLKEM_N; start += 2 * len)
-  __loop__(
-    invariant(start < MLKEM_N + 2 * len)
-    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
-    invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
-    invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
-  {
-    int16_t zeta = zetas[k++];
-    ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
-  }
-}
-
-/*
- * Compute full forward NTT
- * NOTE: This particular implementation satisfies a much tighter
- * bound on the output coefficients (5*q) than the contractual one (8*q),
- * but this is not needed in the calling code. Should we change the
- * base multiplication strategy to require smaller NTT output bounds,
- * the proof may need strengthening.
- */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *p)
-{
-  unsigned len, layer;
-  int16_t *r;
-  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
-  r = p->coeffs;
-
-  for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
-  __loop__(
-    invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer))
-    invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)))
-  {
-    ntt_layer(r, len, layer);
-  }
-
-  /* Check the stronger bound */
-  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
-}
-#else  /* MLKEM_USE_NATIVE_NTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *p)
-{
-  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
-  ntt_native(p);
-  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
-}
-#endif /* MLKEM_USE_NATIVE_NTT */
-
-#if !defined(MLKEM_USE_NATIVE_INTT)
-
-/* Compute one layer of inverse NTT */
-static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
-__contract__(
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
-  requires(len == (1 << (8 - layer)))
-  requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-{
-  unsigned start, k;
-  /* `layer` is a ghost variable used only in the specification */
-  ((void)layer);
-  k = MLKEM_N / len - 1;
-  for (start = 0; start < MLKEM_N; start += 2 * len)
-  __loop__(
-    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-    invariant(start <= MLKEM_N && k <= 127)
-    /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
-    invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
-  {
-    unsigned j;
-    int16_t zeta = zetas[k--];
-    for (j = start; j < start + len; j++)
-    __loop__(
-      invariant(start <= j && j <= start + len)
-      invariant(start <= MLKEM_N && k <= 127)
-      invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-    {
-      int16_t t = r[j];
-      r[j] = barrett_reduce(t + r[j + len]);
-      r[j + len] = r[j + len] - t;
-      r[j + len] = fqmul(r[j + len], zeta);
-    }
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *p)
-{
-  /*
-   * Scale input polynomial to account for Montgomery factor
-   * and NTT twist. This also brings coefficients down to
-   * absolute value < MLKEM_Q.
-   */
-  unsigned j, len, layer;
-  const int16_t f = 1441;
-  int16_t *r = p->coeffs;
-
-  for (j = 0; j < MLKEM_N; j++)
-  __loop__(
-    invariant(j <= MLKEM_N)
-    invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
-  {
-    r[j] = fqmul(r[j], f);
-  }
-
-  /* Run the invNTT layers */
-  for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
-  __loop__(
-    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
-    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-  {
-    invntt_layer(p->coeffs, len, layer);
-  }
-
-  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
-}
-#else  /* MLKEM_USE_NATIVE_INTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *p)
-{
-  intt_native(p);
-  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
-}
-#endif /* MLKEM_USE_NATIVE_INTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
-                    int16_t b_cached)
-{
-  int32_t t0, t1;
-  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
-
-  t0 = (int32_t)a[1] * b_cached;
-  t0 += (int32_t)a[0] * b[0];
-  t1 = (int32_t)a[0] * b[1];
-  t1 += (int32_t)a[1] * b[0];
-
-  /* |ti| < 2 * q * 2^15 */
-  r[0] = montgomery_reduce(t0);
-  r[1] = montgomery_reduce(t1);
-
-  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
-}
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt)
-int empty_cu_ntt;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/ntt.h
deleted file mode 100644
index 4e80d3ab3..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/ntt.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef NTT_H
-#define NTT_H
-#include "common.h"
-
-#include <stdint.h>
-#include "cbmc.h"
-#include "poly.h"
-#include "reduce.h"
-
-#define zetas MLKEM_NAMESPACE(zetas)
-extern const int16_t zetas[128];
-
-#define poly_ntt MLKEM_NAMESPACE(poly_ntt)
-/*************************************************
- * Name:        poly_ntt
- *
- * Description: Computes negacyclic number-theoretic transform (NTT) of
- *              a polynomial in place.
- *
- *              The input is assumed to be in normal order and
- *              coefficient-wise bound by MLKEM_Q in absolute value.
- *
- *              The output polynomial is in bitreversed order, and
- *              coefficient-wise bound by NTT_BOUND in absolute value.
- *
- *              (NOTE: Sometimes the input to the NTT is actually smaller,
- *               which gives better bounds.)
- *
- * Arguments:   - poly *p: pointer to in/output polynomial
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND))
-);
-
-#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont)
-/*************************************************
- * Name:        poly_invntt_tomont
- *
- * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
- *              of a polynomial in place;
- *              inputs assumed to be in bitreversed order, output in normal
- *              order
- *
- *              The input is assumed to be in bitreversed order, and can
- *              have arbitrary coefficients in int16_t.
- *
- *              The output polynomial is in normal order, and
- *              coefficient-wise bound by INVNTT_BOUND in absolute value.
- *
- * Arguments:   - uint16_t *a: pointer to in/output polynomial
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND))
-);
-
-#define basemul_cached MLKEM_NAMESPACE(basemul_cached)
-/************************************************************
- * Name: basemul_cached
- *
- * Description: Computes a representative modulo q of
- *              (a0*b0 + a1*b_cached, a0*b1 + a1*b0)/65536
- *
- *              If b_cached is b1*zeta, this represents the
- *              product of (a0 + a1*X) and (b0 + b1*X) in
- *              Fq[X]/(X^2 - zeta).
- *
- * Arguments: - r: Pointer to output polynomial
- *                   Upon return, coefficients are bound by
- *                   2*MLKEM_Q in absolute value.
- *            - a: Pointer to first input polynomial
- *                   Every coefficient must be in [0..4095]
- *            - b: Pointer to second input polynomial
- *                   Can have arbitrary int16_t coefficients
- *            - b_cached: Some precomputed value, typically derived from
- *                   b1 and a twiddle factor. Can be an arbitary int16_t.
- ************************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
-                    int16_t b_cached)
-__contract__(
-  requires(memory_no_alias(r, 2 * sizeof(int16_t)))
-  requires(memory_no_alias(a, 2 * sizeof(int16_t)))
-  requires(memory_no_alias(b, 2 * sizeof(int16_t)))
-  requires(array_bound(a, 0, 2, 0, UINT12_LIMIT))
-  assigns(memory_slice(r, 2 * sizeof(int16_t)))
-  ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))
-);
-
-#endif /* NTT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/params.h
index 57ea4c8ba..7f6c12625 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/params.h
@@ -18,6 +18,7 @@
 #define MLKEM_N 256
 #define MLKEM_Q 3329
 #define UINT12_LIMIT 4096
+#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */
 
 #define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */
 #define MLKEM_SSBYTES 32  /* size in bytes of shared key */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.c
index 7483ebf6d..e8a2e2c6e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.c
@@ -8,388 +8,246 @@
 #include <stdint.h>
 #include <string.h>
 #include "arith_backend.h"
-#include "cbd.h"
 #include "cbmc.h"
 #include "debug.h"
 #include "fips202x4.h"
-#include "ntt.h"
 #include "poly.h"
-#include "reduce.h"
+#include "sampling.h"
 #include "symmetric.h"
 #include "verify.h"
 
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
-{
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 16)))
-    {
-      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
-    }
-
-    r[i * 4] = t[0] | (t[1] << 4);
-    r[i * 4 + 1] = t[2] | (t[3] << 4);
-    r[i * 4 + 2] = t[4] | (t[5] << 4);
-    r[i * 4 + 3] = t[6] | (t[7] << 4);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
-{
-  unsigned j;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-  for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(invariant(j <= MLKEM_N / 4))
-  {
-    unsigned k;
-    uint16_t t[4];
-    for (k = 0; k < 4; k++)
-    __loop__(
-      invariant(k <= 4)
-      invariant(forall(r, 0, k, t[r] < (1u << 10))))
-    {
-      t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
-    }
-
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 10-bit in size.
-     */
-    r[5 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
-    r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
-    r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
-    r[5 * j + 4] = (t[3] >> 2);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
-  {
-    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
-    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d10(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16)
+#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic)
+#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce)
+#define fqmul MLKEM_NAMESPACE(fqmul)
+#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce)
+#define basemul_cached MLKEM_NAMESPACE(basemul_cached)
+#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q)
+#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block)
+#define ntt_layer MLKEM_NAMESPACE(ntt_layer)
+#define invntt_layer MLKEM_NAMESPACE(invntt_layer)
+/* End of static namespacing */
+
+/*************************************************
+ * Name:        cast_uint16_to_int16
+ *
+ * Description: Cast uint16 value to int16
+ *
+ * Returns:
+ *   input x in     0 .. 32767: returns value unchanged
+ *   input x in 32768 .. 65535: returns (x - 65536)
+ **************************************************/
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "conversion"
+#endif
+ALWAYS_INLINE
+static INLINE int16_t cast_uint16_to_int16(uint16_t x)
 {
-  unsigned j;
-  for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(
-    invariant(j <= MLKEM_N / 4)
-    invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
-  {
-    unsigned k;
-    uint16_t t[4];
-    uint8_t const *base = &a[5 * j];
-
-    t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6));
-    t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4));
-    t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2));
-
-    for (k = 0; k < 4; k++)
-    __loop__(
-      invariant(k <= 4)
-      invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
-    }
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  /*
+   * PORTABILITY: This relies on uint16_t -> int16_t
+   * being implemented as the inverse of int16_t -> uint16_t,
+   * which is implementation-defined (C99 6.3.1.3 (3)).
+   * CBMC (correctly) fails to prove this conversion is OK,
+   * so we have to suppress that check here
+   */
+  return (int16_t)x;
 }
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
-          || MLKEM_K == 3) */
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
 
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
+/*************************************************
+ * Name:        montgomery_reduce_generic
+ *
+ * Description: Generic Montgomery reduction; given a 32-bit integer a, computes
+ *              16-bit integer congruent to a * R^-1 mod q, where R=2^16
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q, with absolute value
+ *                <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2
+ *
+ **************************************************/
+ALWAYS_INLINE
+static INLINE int16_t montgomery_reduce_generic(int32_t a)
 {
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */
+  const uint32_t QINV = 62209; /* q^-1 mod 2^16 */
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 32)))
-    {
-      t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
-    }
+  /*  Compute a*q^{-1} mod 2^16 in unsigned representatives */
+  const uint16_t a_reduced = a & UINT16_MAX;
+  const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
 
-    /*
-     * Explicitly truncate to avoid warning about
-     * implicit truncation in CBMC, and use array indexing into
-     * r rather than pointer-arithmetic to simplify verification
-     */
-    r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
-    r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
-    r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
-    r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
-    r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
-  }
-}
+  /* Lift to signed canonical representative mod 2^16. */
+  const int16_t t = cast_uint16_to_int16(a_inverted);
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
-{
-  unsigned j;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  int32_t r = a - ((int32_t)t * MLKEM_Q);
+  /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */
 
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(invariant(j <= MLKEM_N / 8))
-  {
-    unsigned k;
-    uint16_t t[8];
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(k <= 8)
-      invariant(forall(r, 0, k, t[r] < (1u << 11))))
-    {
-      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
-    }
+  /*
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  r = r >> 16;
+  /* Bounds: |r >> 16| <= ceil(|r| / 2^16)
+   *                   <= ceil(|a| / 2^16 + MLKEM_Q / 2)
+   *                   <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
+   *
+   * (Note that |a >> 16| = ceil(|a| / 2^16) for negative a)
+   */
 
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 11-bit in size.
-     */
-    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
-    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
-    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
-    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
-    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
-    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
-    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
-    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
-    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
-    r[11 * j + 10] = (t[7] >> 3);
-  }
+  return (int16_t)r;
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+/*************************************************
+ * Name:        montgomery_reduce
+ *
+ * Description: Montgomery reduction
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *                  Must be smaller than 2 * 2^12 * 2^15 in absolute value.
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q,
+ *              smaller than 2 * q in absolute value.
+ **************************************************/
+static INLINE int16_t montgomery_reduce(int32_t a)
+__contract__(
+  requires(a > -(2 * UINT12_LIMIT * 32768))
+  requires(a <  (2 * UINT12_LIMIT * 32768))
+  ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
+)
 {
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
-  {
-    unsigned j;
-    uint8_t t[8];
-    const unsigned offset = i * 5;
-    /*
-     * Explicitly truncate to avoid warning about
-     * implicit truncation in CBMC and unwind loop for ease
-     * of proof.
-     */
-
-    /*
-     * Decompress 5 8-bit bytes (so 40 bits) into
-     * 8 5-bit values stored in t[]
-     */
-    t[0] = 0x1F & (a[offset + 0] >> 0);
-    t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3));
-    t[2] = 0x1F & (a[offset + 1] >> 2);
-    t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1));
-    t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4));
-    t[5] = 0x1F & (a[offset + 3] >> 1);
-    t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2));
-    t[7] = 0x1F & (a[offset + 4] >> 3);
-
-    /* and copy to the correct slice in r[] */
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(j <= 8 && i <= MLKEM_N / 8)
-      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
-    }
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  int16_t res;
+  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
+
+  res = montgomery_reduce_generic(a);
+  /* Bounds:
+   * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2
+   *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
+   *        < 2 * MLKEM_Q */
+
+  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
+  return res;
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d11(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+#if !defined(MLKEM_USE_NATIVE_POLY_TOMONT) ||           \
+    !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
+    !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        fqmul
+ *
+ * Description: Montgomery multiplication modulo q=3329
+ *
+ * Arguments:   - int16_t a: first factor
+ *                  Can be any int16_t.
+ *              - int16_t b: second factor.
+ *                  Must be signed canonical (abs value <(q+1)/2)
+ *
+ * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and
+ * smaller than q in absolute value.
+ *
+ **************************************************/
+static INLINE int16_t fqmul(int16_t a, int16_t b)
+__contract__(
+  requires(b > -HALF_Q)
+  requires(b < HALF_Q)
+  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
+)
 {
-  unsigned j;
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(
-    invariant(j <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
-  {
-    unsigned k;
-    uint16_t t[8];
-    uint8_t const *base = &a[11 * j];
-    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
-    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
-                    ((uint16_t)base[4] << 10));
-    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
-    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
-    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
-                    ((uint16_t)base[8] << 9));
-    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
-    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
-
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(k <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
-    }
-  }
+  int16_t res;
+  debug_assert_abs_bound(&b, 1, HALF_Q);
+
+  res = montgomery_reduce((int32_t)a * (int32_t)b);
+  /* Bounds:
+   * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
+   *        < MLKEM_Q
+   */
 
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  debug_assert_abs_bound(&res, 1, MLKEM_Q);
+  return res;
 }
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */
-
-#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+#endif /* !defined(MLKEM_USE_NATIVE_POLY_TOMONT) ||           \
+          !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
+          !defined(MLKEM_USE_NATIVE_NTT) ||                   \
+          !defined(MLKEM_USE_NATIVE_INTT) */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || !defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        barrett_reduce
+ *
+ * Description: Barrett reduction; given a 16-bit integer a, computes
+ *              centered representative congruent to a mod q in
+ *              {-(q-1)/2,...,(q-1)/2}
+ *
+ * Arguments:   - int16_t a: input integer to be reduced
+ *
+ * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
+ **************************************************/
+static INLINE int16_t barrett_reduce(int16_t a)
+__contract__(
+  ensures(return_value > -HALF_Q && return_value < HALF_Q)
+)
 {
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(invariant(i <= MLKEM_N / 2))
-  {
-    const uint16_t t0 = a->coeffs[2 * i];
-    const uint16_t t1 = a->coeffs[2 * i + 1];
-    /*
-     * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
-     * significant data, so these can be packed into 24 bits or exactly
-     * 3 bytes, as follows.
-     */
-
-    /* Least significant bits 0 - 7 of t0. */
-    r[3 * i + 0] = t0 & 0xFF;
-
-    /*
-     * Most significant bits 8 - 11 of t0 become the least significant
-     * nibble of the second byte. The least significant 4 bits
-     * of t1 become the upper nibble of the second byte.
-     */
-    r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+  /*
+   * To divide by MLKEM_Q using Barrett multiplication, the "magic number"
+   * multiplier is round_to_nearest(2**26/MLKEM_Q)
+   */
+  const int BPOWER = 26;
+  const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q;
 
-    /* Bits 4 - 11 of t1 become the third byte. */
-    r[3 * i + 2] = t1 >> 4;
-  }
-}
-#else  /* MLKEM_USE_NATIVE_POLY_TOBYTES */
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
-{
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-  poly_tobytes_native(r, a);
-}
-#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+  /*
+   * Compute round_to_nearest(a/MLKEM_Q) using the multiplier
+   * above and shift by BPOWER places.
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER;
 
-#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
-  {
-    const uint8_t t0 = a[3 * i + 0];
-    const uint8_t t1 = a[3 * i + 1];
-    const uint8_t t2 = a[3 * i + 2];
-    r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
-    r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
-  }
+  /*
+   * t is in -10 .. +10, so we need 32-bit math to
+   * evaluate t * MLKEM_Q and the subsequent subtraction
+   */
+  int16_t res = (int16_t)(a - t * MLKEM_Q);
 
-  /* Note that the coefficients are not canonical */
-  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
-}
-#else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-{
-  poly_frombytes_native(r, a);
+  debug_assert_abs_bound(&res, 1, HALF_Q);
+  return res;
 }
-#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+#endif /* !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || \
+          !defined(MLKEM_USE_NATIVE_INTT) */
+
+static void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
+                           int16_t b_cached)
+__contract__(
+  requires(memory_no_alias(r, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(a, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(b, 2 * sizeof(int16_t)))
+  requires(array_bound(a, 0, 2, 0, UINT12_LIMIT))
+  assigns(memory_slice(r, 2 * sizeof(int16_t)))
+  ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q)))
 {
-  unsigned i;
-#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
-#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
-#endif
+  int32_t t0, t1;
+  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
-  {
-    unsigned j;
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <  MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
-    {
-      /* Prevent the compiler from recognizing this as a bit selection */
-      uint8_t mask = value_barrier_u8(1u << j);
-      r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
-    }
-  }
-  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
-}
+  t0 = (int32_t)a[1] * b_cached;
+  t0 += (int32_t)a[0] * b[0];
+  t1 = (int32_t)a[0] * b[1];
+  t1 += (int32_t)a[1] * b[0];
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
-{
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  /* |ti| < 2 * UINT12_LIMIT * 2^15 */
+  r[0] = montgomery_reduce(t0);
+  r[1] = montgomery_reduce(t1);
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    msg[i] = 0;
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8))
-    {
-      uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
-      msg[i] |= t << j;
-    }
-  }
+  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -434,12 +292,46 @@ void poly_tomont(poly *r)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomont(poly *r)
 {
-  poly_tomont_native(r);
+  poly_tomont_native(r->coeffs);
   debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
 
 #if !defined(MLKEM_USE_NATIVE_POLY_REDUCE)
+/************************************************************
+ * Name: scalar_signed_to_unsigned_q
+ *
+ * Description: Converts a signed polynomial coefficient
+ *              from signed (-3328 .. 3328) form to
+ *              unsigned form (0 .. 3328).
+ *
+ * Note: Cryptographic constant time implementation
+ *
+ * Examples:       0 -> 0
+ *                 1 -> 1
+ *              3328 -> 3328
+ *                -1 -> 3328
+ *                -2 -> 3327
+ *             -3328 -> 1
+ *
+ * Arguments: c: signed coefficient, must satisfy -MLKEM_Q < c < MLKEM_Q
+ ************************************************************/
+static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
+__contract__(
+  requires(c > -MLKEM_Q && c < MLKEM_Q)
+  ensures(return_value >= 0 && return_value < MLKEM_Q)
+  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
+{
+  debug_assert_abs_bound(&c, 1, MLKEM_Q);
+
+  /* Add Q if c is negative, but in constant time */
+  c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
+
+  /* c is now in [0, MLKEM_Q), so the cast to uint16_t is safe. */
+  debug_assert_bound(&c, 1, 0, MLKEM_Q);
+  return (uint16_t)c;
+}
+
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
@@ -461,7 +353,7 @@ void poly_reduce(poly *r)
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
-  poly_reduce_native(r);
+  poly_reduce_native(r->coeffs);
   debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
@@ -520,13 +412,232 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 MLKEM_NATIVE_INTERNAL_API
 void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
-  poly_mulcache_compute_native(x, a);
+  poly_mulcache_compute_native(x->coeffs, a->coeffs);
   /* Omitting bounds assertion since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
 }
 #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 
+#if !defined(MLKEM_USE_NATIVE_NTT)
+/*
+ * Computes a block of CT butterflies with a fixed twiddle factor,
+ * using Montgomery multiplication.
+ * Parameters:
+ * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
+ * - zeta: Twiddle factor to use for the butterfly. This must be in
+ *         Montgomery form and signed canonical.
+ * - start: Offset to the beginning of the butterfly block
+ * - len: Index difference between coefficients subject to a butterfly
+ * - bound: Ghost variable describing coefficient bound: Prior to `start`,
+ *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
+ *          they must be bound by `bound`.
+ * When this function returns, output coefficients in the index range
+ * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
+ * Example:
+ * - start=8, len=4
+ *   This would compute the following four butterflies
+ *          8     --    12
+ *             9    --     13
+ *                10   --     14
+ *                   11   --     15
+ * - start=4, len=2
+ *   This would compute the following two butterflies
+ *          4 -- 6
+ *             5 -- 7
+ */
+static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
+                                unsigned start, unsigned len, int bound)
+__contract__(
+  requires(start < MLKEM_N)
+  requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
+  requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
+  requires(-HALF_Q < zeta && zeta < HALF_Q)
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(array_abs_bound(r, 0, start, bound + MLKEM_Q))
+  requires(array_abs_bound(r, start, MLKEM_N, bound))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q))
+  ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
+{
+  /* `bound` is a ghost variable only needed in the CBMC specification */
+  unsigned j;
+  ((void)bound);
+  for (j = start; j < start + len; j++)
+  __loop__(
+    invariant(start <= j && j <= start + len)
+    /*
+     * Coefficients are updated in strided pairs, so the bounds for the
+     * intermediate states alternate twice between the old and new bound
+     */
+    invariant(array_abs_bound(r, 0,           j,           bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j,           start + len, bound))
+    invariant(array_abs_bound(r, start + len, j + len,     bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j + len,     MLKEM_N,     bound)))
+  {
+    int16_t t;
+    t = fqmul(r[j + len], zeta); /* twiddle times the upper coefficient */
+    r[j + len] = r[j] - t; /* reads the old r[j], so it is written first */
+    r[j] = r[j] + t;
+  }
+}
+
+/*
+ * Compute one layer of the forward NTT
+ * Parameters:
+ * - r: Pointer to base of polynomial
+ * - len: Stride of butterflies in this layer.
+ * - layer: Ghost variable indicating which layer is being applied.
+ *          Must match `len` via `len == MLKEM_N >> layer`.
+ * Note: `len` could be dropped and computed in the function, but
+ *   we are following the structure of the reference NTT from the
+ *   official Kyber implementation here, merely adding `layer` as
+ *   a ghost variable for the specifications.
+ */
+static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
+  requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
+{
+  unsigned start, k;
+  /* `layer` is a ghost variable only needed in the CBMC specification */
+  ((void)layer);
+  /* Twiddle factors for this layer start at index 2^(layer-1) */
+  k = MLKEM_N / (2 * len);
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(start < MLKEM_N + 2 * len)
+    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
+    invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
+    invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
+  {
+    int16_t zeta = zetas[k++]; /* one twiddle per block of 2 * len */
+    ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
+  }
+}
+
+/*
+ * Compute full forward NTT in place, one call to ntt_layer() per layer
+ * NOTE: This particular implementation satisfies a much tighter
+ * bound on the output coefficients (5*q) than the contractual one (8*q),
+ * but this is not needed in the calling code. Should we change the
+ * base multiplication strategy to require smaller NTT output bounds,
+ * the proof may need strengthening.
+ */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *p)
+{
+  unsigned len, layer;
+  int16_t *r;
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+  r = p->coeffs;
+  /* Apply the layers with len = 128, 64, ..., 2 (layer = 1, ..., 7) */
+  for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
+  __loop__(
+    invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer))
+    invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)))
+  {
+    ntt_layer(r, len, layer);
+  }
+
+  /* Check the output bound NTT_BOUND required by the contract */
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
+}
+#else  /* MLKEM_USE_NATIVE_NTT */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *p)
+{
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+  ntt_native(p->coeffs); /* backend works on the raw coefficient array */
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
+}
+#endif /* MLKEM_USE_NATIVE_NTT */
+
+#if !defined(MLKEM_USE_NATIVE_INTT)
+
+/* Compute one layer of inverse NTT; |coefficients| stay below MLKEM_Q */
+static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
+  requires(len == (1 << (8 - layer)))
+  requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+{
+  unsigned start, k;
+  /* `layer` is a ghost variable used only in the specification */
+  ((void)layer);
+  k = MLKEM_N / len - 1; /* highest twiddle index of the layer; counts down */
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+    invariant(start <= MLKEM_N && k <= 127)
+    /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
+    invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
+  {
+    unsigned j;
+    int16_t zeta = zetas[k--]; /* one twiddle per block of 2 * len */
+    for (j = start; j < start + len; j++)
+    __loop__(
+      invariant(start <= j && j <= start + len)
+      invariant(start <= MLKEM_N && k <= 127)
+      invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+    {
+      int16_t t = r[j]; /* saved: r[j] is overwritten on the next line */
+      r[j] = barrett_reduce(t + r[j + len]);
+      r[j + len] = r[j + len] - t; /* difference, scaled by zeta below */
+      r[j + len] = fqmul(r[j + len], zeta);
+    }
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *p)
+{
+  /*
+   * Scale input polynomial to account for Montgomery factor
+   * and NTT twist. This also brings coefficients down to
+   * absolute value < MLKEM_Q.
+   */
+  unsigned j, len, layer;
+  const int16_t f = 1441; /* scaling factor applied to every coefficient */
+  int16_t *r = p->coeffs;
+
+  for (j = 0; j < MLKEM_N; j++)
+  __loop__(
+    invariant(j <= MLKEM_N)
+    invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
+  {
+    r[j] = fqmul(r[j], f);
+  }
+
+  /* Run the invNTT layers: len = 2, 4, ..., 128 (layer = 7 down to 1) */
+  for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
+  __loop__(
+    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
+    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+  {
+    invntt_layer(p->coeffs, len, layer); /* p->coeffs is the array r */
+  }
+
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
+}
+#else  /* MLKEM_USE_NATIVE_INTT */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *p)
+{
+  intt_native(p->coeffs); /* backend works on the raw coefficient array */
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
+}
+#endif /* MLKEM_USE_NATIVE_INTT */
+
 #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 #define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.h
index 6a14c785d..cb0d67c1a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly.h
@@ -9,7 +9,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "reduce.h"
+#include "debug.h"
 #include "verify.h"
 
 /* Absolute exclusive upper bound for the output of the inverse NTT */
@@ -18,6 +18,9 @@
 /* Absolute exclusive upper bound for the output of the forward NTT */
 #define NTT_BOUND (8 * MLKEM_Q)
 
+#define zetas MLKEM_NAMESPACE(zetas)
+extern const int16_t zetas[128];
+
 /*
  * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
  * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
@@ -38,520 +41,6 @@ typedef struct
   int16_t coeffs[MLKEM_N >> 1];
 } poly_mulcache;
 
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1)
-#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4)
-#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5)
-#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10)
-#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11)
-#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4)
-#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5)
-#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10)
-#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11)
-#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q)
-/* End of static namespacing */
-
-/************************************************************
- * Name: scalar_compress_d1
- *
- * Description: Computes round(u * 2 / q)
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 1.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d1(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 2)
-  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
-{
-  uint32_t d0 = u << 1;
-  d0 *= 645083;
-  d0 += 1u << 30;
-  d0 >>= 31;
-  return d0;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_compress_d4
- *
- * Description: Computes round(u * 16 / q) % 16
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 4.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d4(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 16)
-  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
-{
-  uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */
-  return (d0 + (1u << 27)) >> 28;      /* round(d0/2^28) */
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d4
- *
- * Description: Computes round(u * q / 16)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 4.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d4(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 16)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 8) / 16; }
-
-/************************************************************
- * Name: scalar_compress_d5
- *
- * Description: Computes round(u * 32 / q) % 32
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 5.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d5(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 32)
-  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
-{
-  uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */
-  return (d0 + (1u << 26)) >> 27;      /* round(d0/2^27) */
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d5
- *
- * Description: Computes round(u * q / 32)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 5.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 32
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d5(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 32)
-  ensures(return_value <= MLKEM_Q - 1)
-) { return ((u * MLKEM_Q) + 16) / 32; }
-
-/************************************************************
- * Name: scalar_compress_d10
- *
- * Description: Computes round(u * 2**10 / q) % 2**10
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d10(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < (1u << 10))
-  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
-{
-  uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^32 / MLKEM_Q) */
-  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
-  return (d0 & 0x3FF);
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d10
- *
- * Description: Computes round(u * q / 1024)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d10(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 1024)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 512) / 1024; }
-
-/************************************************************
- * Name: scalar_compress_d11
- *
- * Description: Computes round(u * 2**11 / q) % 2**11
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 11.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d11(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < (1u << 11))
-  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
-{
-  uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */
-  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
-  return (d0 & 0x7FF);
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d11
- *
- * Description: Computes round(u * q / 1024)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d11(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 2048)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 1024) / 2048; }
-
-/************************************************************
- * Name: scalar_signed_to_unsigned_q
- *
- * Description: converts signed polynomial coefficient
- *              from signed (-3328 .. 3328) form to
- *              unsigned form (0 .. 3328).
- *
- * Note: Cryptographic constant time implementation
- *
- * Examples:       0 -> 0
- *                 1 -> 1
- *              3328 -> 3328
- *                -1 -> 3328
- *                -2 -> 3327
- *             -3328 -> 1
- *
- * Arguments: c: signed coefficient to be converted
- ************************************************************/
-static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
-__contract__(
-  requires(c > -MLKEM_Q && c < MLKEM_Q)
-  ensures(return_value >= 0 && return_value < MLKEM_Q)
-  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
-{
-  debug_assert_abs_bound(&c, 1, MLKEM_Q);
-
-  /* Add Q if c is negative, but in constant time */
-  c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
-
-  /* and therefore cast to uint16_t is safe. */
-  debug_assert_bound(&c, 1, 0, MLKEM_Q);
-  return (uint16_t)c;
-}
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
-    (MLKEM_K == 2 || MLKEM_K == 3)
-#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
-/*************************************************
- * Name:        poly_compress_d4
- *
- * Description: Compression (4 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
-
-#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
-/*************************************************
- * Name:        poly_compress_d10
- *
- * Description: Compression (10 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
-
-#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
-/*************************************************
- * Name:        poly_decompress_d4
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
-
-#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
-/*************************************************
- * Name:        poly_decompress_d10
- *
- * Description: De-serialization and subsequent decompression (10 bits) of a
- *              polynomial; approximate inverse of poly_compress_d10
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d10(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
-          || MLKEM_K == 3) */
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
-#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
-/*************************************************
- * Name:        poly_compress_d5
- *
- * Description: Compression (5 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
-
-#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
-/*************************************************
- * Name:        poly_compress_d11
- *
- * Description: Compression (11 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
-
-#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
-/*************************************************
- * Name:        poly_decompress_d5
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
-
-#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
-/*************************************************
- * Name:        poly_decompress_d11
- *
- * Description: De-serialization and subsequent decompression (11 bits) of a
- *              polynomial; approximate inverse of poly_compress_d11
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d11(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
-        */
-
-#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
-/*************************************************
- * Name:        poly_tobytes
- *
- * Description: Serialization of a polynomial.
- *              Signed coefficients are converted to
- *              unsigned form before serialization.
- *
- * Arguments:   INPUT:
- *              - a: const pointer to input polynomial,
- *                with each coefficient in the range [0,1,..,Q-1]
- *              OUTPUT
- *              - r: pointer to output byte array
- *                   (of MLKEM_POLYBYTES bytes)
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYBYTES))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r))
-);
-
-
-#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes)
-/*************************************************
- * Name:        poly_frombytes
- *
- * Description: De-serialization of a polynomial.
- *
- * Arguments:   INPUT
- *              - a: pointer to input byte array
- *                   (of MLKEM_POLYBYTES bytes)
- *              OUTPUT
- *              - r: pointer to output polynomial, with
- *                   each coefficient unsigned and in the range
- *                   0 .. 4095
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))
-);
-
-
-#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg)
-/*************************************************
- * Name:        poly_frommsg
- *
- * Description: Convert 32-byte message to polynomial
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *msg: pointer to input message
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
-__contract__(
-  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
-
-#define poly_tomsg MLKEM_NAMESPACE(poly_tomsg)
-/*************************************************
- * Name:        poly_tomsg
- *
- * Description: Convert polynomial to 32-byte message
- *
- * Arguments:   - uint8_t *msg: pointer to output message
- *              - const poly *r: pointer to input polynomial
- *                Coefficients must be unsigned canonical
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r)
-__contract__(
-  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(msg))
-);
-
 #define poly_basemul_montgomery_cached \
   MLKEM_NAMESPACE(poly_basemul_montgomery_cached)
 /*************************************************
@@ -715,4 +204,56 @@ __contract__(
   assigns(object_whole(r))
 );
 
+#define poly_ntt MLKEM_NAMESPACE(poly_ntt)
+/*************************************************
+ * Name:        poly_ntt
+ *
+ * Description: Computes negacyclic number-theoretic transform (NTT) of
+ *              a polynomial in place.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bound by MLKEM_Q in absolute value.
+ *
+ *              The output polynomial is in bitreversed order, and
+ *              coefficient-wise bound by NTT_BOUND in absolute value.
+ *
+ *              (NOTE: Sometimes the input to the NTT is actually smaller,
+ *               which gives better bounds.)
+ *
+ * Arguments:   - poly *r: pointer to in/output polynomial
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND))
+);
+
+#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont)
+/*************************************************
+ * Name:        poly_invntt_tomont
+ *
+ * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+ *              of a polynomial in place;
+ *              inputs assumed to be in bitreversed order, output in normal
+ *              order
+ *
+ *              The input is assumed to be in bitreversed order, and can
+ *              have arbitrary coefficients in int16_t.
+ *
+ *              The output polynomial is in normal order, and
+ *              coefficient-wise bound by INVNTT_BOUND in absolute value.
+ *
+ * Arguments:   - poly *r: pointer to in/output polynomial
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND))
+);
+
 #endif /* POLY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly_k.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly_k.c
new file mode 100644
index 000000000..c2d330ea9
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly_k.c
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "poly_k.h"
+#include <stdint.h>
+#include <string.h>
+#include "arith_backend.h"
+#include "compress.h"
+#include "sampling.h"
+#include "symmetric.h"
+
+#include "debug.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1)
+#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2)
+/* End of static namespacing */
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
+                         const polyvec *a)
+{
+  unsigned i;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_decompress_du(polyvec *r,
+                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
+  }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
+{
+  unsigned i;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
+  }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_ntt(polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_ntt(&r->vec[i]);
+  }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_invntt_tomont(polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_invntt_tomont(&r->vec[i]);
+  }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND);
+}
+
+#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+{
+  unsigned i;
+  poly t;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+
+  poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]);
+  for (i = 1; i < MLKEM_K; i++)
+  {
+    poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i],
+                                   &b_cache->vec[i]);
+    poly_add(r, &t);
+  }
+
+  /*
+   * Each of the MLKEM_K accumulated products is assumed to be bound by
+   * 2 * MLKEM_Q in absolute value (TODO confirm against poly.h). This holds
+   * for the C implementation but is omitted from the spec so as not to
+   * constrain native implementations; it is checked here nonetheless.
+   */
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_K * 2 * MLKEM_Q);
+}
+#else  /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+{
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+  /* Omitting bounds assertion for cache since native implementations may
+   * decide not to use a mulcache. Note that the C backend implementation
+   * of poly_basemul_montgomery_cached() does still include the check. */
+  polyvec_basemul_acc_montgomery_cached_native(r->coeffs, (const int16_t *)a,
+                                               (const int16_t *)b,
+                                               (const int16_t *)b_cache);
+}
+#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
+{
+  polyvec_mulcache b_cache;
+  polyvec_mulcache_compute(&b_cache, b);
+  polyvec_basemul_acc_montgomery_cached(r, a, b, &b_cache);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_mulcache_compute(&x->vec[i], &a->vec[i]);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_reduce(polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_reduce(&r->vec[i]);
+  }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_add(polyvec *r, const polyvec *b)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_add(&r->vec[i], &b->vec[i]);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_tomont(polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_tomont(&r->vec[i]);
+  }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
+}
+
+
+/*************************************************
+ * Name:        poly_cbd_eta1
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta1(poly *r,
+                                 const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
+)
+{
+#if MLKEM_ETA1 == 2
+  poly_cbd2(r, buf);
+#elif MLKEM_ETA1 == 3
+  poly_cbd3(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA1"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+{
+  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
+  memcpy(extkey0, seed, MLKEM_SYMBYTES);
+  memcpy(extkey1, seed, MLKEM_SYMBYTES);
+  memcpy(extkey2, seed, MLKEM_SYMBYTES);
+  memcpy(extkey3, seed, MLKEM_SYMBYTES);
+  extkey0[MLKEM_SYMBYTES] = nonce0;
+  extkey1[MLKEM_SYMBYTES] = nonce1;
+  extkey2[MLKEM_SYMBYTES] = nonce2;
+  extkey3[MLKEM_SYMBYTES] = nonce3;
+  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
+  poly_cbd_eta1(r0, buf0);
+  poly_cbd_eta1(r1, buf1);
+  poly_cbd_eta1(r2, buf2);
+  poly_cbd_eta1(r3, buf3);
+
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
+}
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+/*************************************************
+ * Name:        poly_cbd_eta2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta2(poly *r,
+                                 const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1)))
+{
+#if MLKEM_ETA2 == 2
+  poly_cbd2(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA2"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+{
+  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
+  /* PRF input is the seed followed by the one-byte nonce. */
+  memcpy(extkey, seed, MLKEM_SYMBYTES);
+  extkey[MLKEM_SYMBYTES] = nonce;
+  prf_eta2(buf, extkey);
+  /* Turn the PRF output bytes into a polynomial via poly_cbd_eta2(). */
+  poly_cbd_eta2(r, buf);
+  /* Matches the MLKEM_ETA2 + 1 bound ensured by poly_cbd_eta2(). */
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1);
+}
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+
+#if MLKEM_K == 2
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+{
+  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
+  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+  extkey[0][MLKEM_SYMBYTES] = nonce0;
+  extkey[1][MLKEM_SYMBYTES] = nonce1;
+  extkey[2][MLKEM_SYMBYTES] = nonce2;
+  extkey[3][MLKEM_SYMBYTES] = nonce3;
+
+  prf_eta1(buf1[0], extkey[0]);
+  prf_eta1(buf1[1], extkey[1]);
+  prf_eta2(buf2[0], extkey[2]);
+  prf_eta2(buf2[1], extkey[3]);
+
+  poly_cbd_eta1(r0, buf1[0]);
+  poly_cbd_eta1(r1, buf1[1]);
+  poly_cbd_eta2(r2, buf2[0]);
+  poly_cbd_eta2(r3, buf2[1]);
+
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1);
+}
+#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly_k.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly_k.h
new file mode 100644
index 000000000..0aea95912
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/poly_k.h
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef POLY_K_H
+#define POLY_K_H
+
+#include <stdint.h>
+#include "common.h"
+#include "compress.h"
+#include "poly.h"
+
+#define polyvec MLKEM_NAMESPACE_K(polyvec)
+typedef struct
+{
+  poly vec[MLKEM_K];
+} ALIGN polyvec;
+
+#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache)
+typedef struct
+{
+  poly_mulcache vec[MLKEM_K];
+} polyvec_mulcache;
+
+#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du)
+/*************************************************
+ * Name:        poly_compress_du
+ *
+ * Description: Compression (du bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU)))
+{
+#if MLKEM_DU == 10
+  poly_compress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_compress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du)
+/*************************************************
+ * Name:        poly_decompress_du
+ *
+ * Description: De-serialization and subsequent decompression (du bits) of a
+ *              polynomial; approximate inverse of poly_compress_du
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_du(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DU == 10
+  poly_decompress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_decompress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv)
+/*************************************************
+ * Name:        poly_compress_dv
+ *
+ * Description: Compression (dv bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r)))
+{
+#if MLKEM_DV == 4
+  poly_compress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_compress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+
+#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv)
+/*************************************************
+ * Name:        poly_decompress_dv
+ *
+ * Description: De-serialization and subsequent decompression (dv bits) of a
+ *              polynomial; approximate inverse of poly_compress_dv
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_dv(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DV == 4
+  poly_decompress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_decompress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du)
+/*************************************************
+ * Name:        polyvec_compress_du
+ *
+ * Description: Compress and serialize vector of polynomials
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU)
+ *              - const polyvec *a: pointer to input vector of polynomials.
+ *                                  Coefficients must be unsigned canonical,
+ *                                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
+                         const polyvec *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(forall(k0, 0, MLKEM_K,
+         array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+  assigns(object_whole(r))
+);
+
+#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du)
+/*************************************************
+ * Name:        polyvec_decompress_du
+ *
+ * Description: De-serialize and decompress vector of polynomials;
+ *              approximate inverse of polyvec_compress_du
+ *
+ * Arguments:   - polyvec *r:       pointer to output vector of polynomials.
+ *                Output will have coefficients normalized to [0,..,q-1].
+ *              - const uint8_t *a: pointer to input byte array
+ *                                  (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU)
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_decompress_du(polyvec *r,
+                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(k0, 0, MLKEM_K,
+         array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+);
+
+#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes)
+/*************************************************
+ * Name:        polyvec_tobytes
+ *
+ * Description: Serialize vector of polynomials
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (needs space for MLKEM_POLYVECBYTES)
+ *              - const polyvec *a: pointer to input vector of polynomials
+ *                  Each polynomial must have coefficients in [0,..,q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
+__contract__(
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(memory_no_alias(r, MLKEM_POLYVECBYTES))
+  requires(forall(k0, 0, MLKEM_K,
+         array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+  assigns(object_whole(r))
+);
+
+#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes)
+/*************************************************
+ * Name:        polyvec_frombytes
+ *
+ * Description: De-serialize vector of polynomials;
+ *              inverse of polyvec_tobytes
+ *
+ * Arguments:   - polyvec *r: pointer to output vector of polynomials.
+ *                 Output will have coefficients normalized in [0..4095].
+ *              - const uint8_t *a: pointer to input byte array
+ *                 (of length MLKEM_POLYVECBYTES)
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(memory_no_alias(a, MLKEM_POLYVECBYTES))
+  assigns(object_whole(r))
+  ensures(forall(k0, 0, MLKEM_K,
+        array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
+);
+
+#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt)
+/*************************************************
+ * Name:        polyvec_ntt
+ *
+ * Description: Apply forward NTT to all elements of a vector of polynomials.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bound by MLKEM_Q in absolute value.
+ *
+ *              The output polynomial is in bitreversed order, and
+ *              coefficient-wise bound by NTT_BOUND in absolute value.
+ *
+ * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_ntt(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(forall(j, 0, MLKEM_K,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+  assigns(object_whole(r))
+  ensures(forall(j, 0, MLKEM_K,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND)))
+);
+
+#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont)
+/*************************************************
+ * Name:        polyvec_invntt_tomont
+ *
+ * Description: Apply inverse NTT to all elements of a vector of polynomials
+ *              and multiply by Montgomery factor 2^16
+ *
+ *              The input is assumed to be in bitreversed order, and can
+ *              have arbitrary coefficients in int16_t.
+ *
+ *              The output polynomial is in normal order, and
+ *              coefficient-wise bound by INVNTT_BOUND in absolute value.
+ *
+ *
+ * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_invntt_tomont(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(j, 0, MLKEM_K,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, INVNTT_BOUND)))
+);
+
+#define polyvec_basemul_acc_montgomery \
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery)
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery
+ *
+ * Description: Multiply elements of a and b in NTT domain, accumulate into r,
+ *              and multiply by 2^-16.
+ *
+ * Arguments: - poly *r: pointer to output polynomial
+ *            - const polyvec *a: pointer to first input vector of polynomials
+ *            - const polyvec *b: pointer to second input vector of polynomials
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(memory_no_alias(b, sizeof(polyvec)))
+  requires(forall(k1, 0, MLKEM_K,
+    array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
+  assigns(memory_slice(r, sizeof(poly)))
+);
+
+
+#define polyvec_basemul_acc_montgomery_cached \
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached)
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery_cached
+ *
+ * Description: Scalar product of two vectors of polynomials in NTT domain,
+ *              using mulcache for second operand.
+ *
+ *              Bounds:
+ *              - Every coefficient of a is assumed to be in [0..4095]
+ *              - No bounds guarantees for the coefficients in the result.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const polyvec *a: pointer to first input polynomial vector
+ *              - const polyvec *b: pointer to second input polynomial vector
+ *              - const polyvec_mulcache *b_cache: pointer to mulcache
+ *                  for second input polynomial vector. Can be computed
+ *                  via polyvec_mulcache_compute().
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(memory_no_alias(b, sizeof(polyvec)))
+  requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache)))
+  requires(forall(k1, 0, MLKEM_K,
+     array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
+  assigns(memory_slice(r, sizeof(poly)))
+);
+
+#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute)
+/************************************************************
+ * Name: polyvec_mulcache_compute
+ *
+ * Description: Computes the mulcache for a vector of polynomials in NTT domain
+ *
+ *              The mulcache of a degree-2 polynomial b := b0 + b1*X
+ *              in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when
+ *              computing products of b in Fq[X]/(X^2-zeta).
+ *
+ *              The mulcache of a polynomial in NTT domain -- which is
+ *              a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta),
+ *              for varying zeta, is the 128-tuple of mulcaches of those
+ *              polynomials.
+ *
+ *              The mulcache of a vector of polynomials is the vector
+ *              of mulcaches of its entries.
+ *
+ * Arguments: - x: Pointer to mulcache to be populated
+ *            - a: Pointer to input polynomial vector
+ ************************************************************/
+/*
+ * NOTE: The default C implementation of this function populates
+ * the mulcache with values in (-q,q), but this is not needed for the
+ * higher level safety proofs, and thus not part of the spec.
+ */
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
+__contract__(
+  requires(memory_no_alias(x, sizeof(polyvec_mulcache)))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  assigns(object_whole(x))
+);
+
+#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce)
+/*************************************************
+ * Name:        polyvec_reduce
+ *
+ * Description: Applies Barrett reduction to each coefficient
+ *              of each element of a vector of polynomials;
+ *              for details of the Barrett reduction see comments in reduce.c
+ *
+ * Arguments:   - polyvec *r: pointer to input/output vector of polynomials
+ **************************************************/
+/*
+ * NOTE: The semantics of polyvec_reduce() is different in
+ *       the reference implementation, which requires
+ *       signed canonical output data. Unsigned canonical
+ *       outputs are better suited to the only remaining
+ *       use of poly_reduce() in the context of (de)serialization.
+ */
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_reduce(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(k0, 0, MLKEM_K,
+    array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+);
+
+#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add)
+/*************************************************
+ * Name:        polyvec_add
+ *
+ * Description: Add vectors of polynomials
+ *
+ * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be
+ *              added to
+ *            - const polyvec *b: pointer to second input vector of polynomials
+ *
+ * The coefficients of r and b must be so that the addition does
+ * not overflow. Otherwise, the behaviour of this function is undefined.
+ *
+ * The coefficients returned in *r are in int16_t which is sufficient
+ * to prove type-safety of calling units. Therefore, no stronger
+ * ensures clause is required on this function.
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_add(polyvec *r, const polyvec *b)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(memory_no_alias(b, sizeof(polyvec)))
+  requires(forall(j0, 0, MLKEM_K,
+          forall(k0, 0, MLKEM_N,
+            (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX)))
+  requires(forall(j1, 0, MLKEM_K,
+          forall(k1, 0, MLKEM_N,
+            (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN)))
+  assigns(object_whole(r))
+);
+
+#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont)
+/*************************************************
+ * Name:        polyvec_tomont
+ *
+ * Description: Inplace conversion of all coefficients of a polynomial
+ *              vector from normal domain to Montgomery domain
+ *
+ *              Bounds: Output < q in absolute value.
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_tomont(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(memory_slice(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(j, 0, MLKEM_K,
+    array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+);
+
+#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial distribution
+ * with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+/* Depending on MLKEM_K, the pointers passed to this function belong
+   to the same objects, so we cannot use memory_no_alias for r0-r3.
+
+   NOTE: Somehow it is important to use memory_no_alias() first in the
+         conjunctions defining each case.
+*/
+#if MLKEM_K == 2
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
+    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+);
+#elif MLKEM_K == 4
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case B: r0, r1, r2, r3 consecutive */
+    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+);
+#elif MLKEM_K == 3
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case C: r0, r1, r2 consecutive */
+ (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
+  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+);
+#endif /* MLKEM_K */
+
+#if MLKEM_ETA1 == MLKEM_ETA2
+/*
+ * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
+ * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
+ * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
+ */
+#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
+#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2)
+/*************************************************
+ * Name:        poly_getnoise_eta2
+ *
+ * Description: Sample a polynomial deterministically from a seed and a nonce,
+ *              with output polynomial close to centered binomial distribution
+ *              with parameter MLKEM_ETA2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
+);
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+#if MLKEM_K == 2
+#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1122_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial
+ * distribution with parameter MLKEM_ETA1 (r0, r1) and MLKEM_ETA2 (r2, r3)
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+__contract__(
+  requires( /* r0, r1 consecutive, r2, r3 consecutive */
+ (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
+     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1))
+);
+#endif /* MLKEM_K == 2 */
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/polyvec.c
deleted file mode 100644
index 50ea1c34a..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/polyvec.c
+++ /dev/null
@@ -1,330 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "polyvec.h"
-#include <stdint.h>
-#include <string.h>
-#include "arith_backend.h"
-#include "cbd.h"
-#include "ntt.h"
-#include "poly.h"
-#include "symmetric.h"
-
-#include "debug.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1)
-#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2)
-/* End of static namespacing */
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
-                         const polyvec *a)
-{
-  unsigned i;
-  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_decompress_du(polyvec *r,
-                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
-  }
-
-  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
-{
-  unsigned i;
-  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
-  }
-
-  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_ntt(polyvec *r)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_ntt(&r->vec[i]);
-  }
-
-  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_invntt_tomont(polyvec *r)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_invntt_tomont(&r->vec[i]);
-  }
-
-  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND);
-}
-
-#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
-                                           const polyvec *b,
-                                           const polyvec_mulcache *b_cache)
-{
-  unsigned i;
-  poly t;
-  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
-
-  poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]);
-  for (i = 1; i < MLKEM_K; i++)
-  {
-    poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i],
-                                   &b_cache->vec[i]);
-    poly_add(r, &t);
-  }
-
-  /*
-   * This bound is true for the C implementation, but not needed
-   * in the higher level bounds reasoning. It is thus omitted
-   * them from the spec to not unnecessarily constrain native
-   * implementations, but checked here nonetheless.
-   */
-  debug_assert_abs_bound(r, MLKEM_K, MLKEM_N * 2 * MLKEM_Q);
-}
-#else  /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
-                                           const polyvec *b,
-                                           const polyvec_mulcache *b_cache)
-{
-  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
-  /* Omitting bounds assertion for cache since native implementations may
-   * decide not to use a mulcache. Note that the C backend implementation
-   * of poly_basemul_montgomery_cached() does still include the check. */
-  polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache);
-}
-#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
-{
-  polyvec_mulcache b_cache;
-  polyvec_mulcache_compute(&b_cache, b);
-  polyvec_basemul_acc_montgomery_cached(r, a, b, &b_cache);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_mulcache_compute(&x->vec[i], &a->vec[i]);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_reduce(polyvec *r)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_reduce(&r->vec[i]);
-  }
-
-  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_add(polyvec *r, const polyvec *b)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_add(&r->vec[i], &b->vec[i]);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_tomont(polyvec *r)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_tomont(&r->vec[i]);
-  }
-
-  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
-}
-
-
-/*************************************************
- * Name:        poly_cbd_eta1
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA1.
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-static INLINE void poly_cbd_eta1(poly *r,
-                                 const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
-)
-{
-#if MLKEM_ETA1 == 2
-  poly_cbd2(r, buf);
-#elif MLKEM_ETA1 == 3
-  poly_cbd3(r, buf);
-#else
-#error "Invalid value of MLKEM_ETA1"
-#endif
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-{
-  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
-  memcpy(extkey0, seed, MLKEM_SYMBYTES);
-  memcpy(extkey1, seed, MLKEM_SYMBYTES);
-  memcpy(extkey2, seed, MLKEM_SYMBYTES);
-  memcpy(extkey3, seed, MLKEM_SYMBYTES);
-  extkey0[MLKEM_SYMBYTES] = nonce0;
-  extkey1[MLKEM_SYMBYTES] = nonce1;
-  extkey2[MLKEM_SYMBYTES] = nonce2;
-  extkey3[MLKEM_SYMBYTES] = nonce3;
-  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
-  poly_cbd_eta1(r0, buf0);
-  poly_cbd_eta1(r1, buf1);
-  poly_cbd_eta1(r2, buf2);
-  poly_cbd_eta1(r3, buf3);
-
-  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
-}
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-/*************************************************
- * Name:        poly_cbd_eta2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA2.
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-static INLINE void poly_cbd_eta2(poly *r,
-                                 const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1)))
-{
-#if MLKEM_ETA2 == 2
-  poly_cbd2(r, buf);
-#else
-#error "Invalid value of MLKEM_ETA2"
-#endif
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-{
-  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
-
-  memcpy(extkey, seed, MLKEM_SYMBYTES);
-  extkey[MLKEM_SYMBYTES] = nonce;
-  prf_eta2(buf, extkey);
-
-  poly_cbd_eta2(r, buf);
-
-  debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA1 + 1);
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-
-#if MLKEM_K == 2
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-{
-  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
-  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
-  extkey[0][MLKEM_SYMBYTES] = nonce0;
-  extkey[1][MLKEM_SYMBYTES] = nonce1;
-  extkey[2][MLKEM_SYMBYTES] = nonce2;
-  extkey[3][MLKEM_SYMBYTES] = nonce3;
-
-  prf_eta1(buf1[0], extkey[0]);
-  prf_eta1(buf1[1], extkey[1]);
-  prf_eta2(buf2[0], extkey[2]);
-  prf_eta2(buf2[1], extkey[3]);
-
-  poly_cbd_eta1(r0, buf1[0]);
-  poly_cbd_eta1(r1, buf1[1]);
-  poly_cbd_eta2(r2, buf2[0]);
-  poly_cbd_eta2(r3, buf2[1]);
-
-  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1);
-  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1);
-}
-#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/polyvec.h
deleted file mode 100644
index 8be8579e0..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/polyvec.h
+++ /dev/null
@@ -1,595 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef POLYVEC_H
-#define POLYVEC_H
-
-#include <stdint.h>
-#include "common.h"
-#include "poly.h"
-
-#define polyvec MLKEM_NAMESPACE_K(polyvec)
-typedef struct
-{
-  poly vec[MLKEM_K];
-} ALIGN polyvec;
-
-#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache)
-typedef struct
-{
-  poly_mulcache vec[MLKEM_K];
-} polyvec_mulcache;
-
-#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du)
-/*************************************************
- * Name:        poly_compress_du
- *
- * Description: Compression (du bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU],
-                                    const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU)))
-{
-#if MLKEM_DU == 10
-  poly_compress_d10(r, a);
-#elif MLKEM_DU == 11
-  poly_compress_d11(r, a);
-#else
-#error "Invalid value of MLKEM_DU"
-#endif
-}
-
-#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du)
-/*************************************************
- * Name:        poly_decompress_du
- *
- * Description: De-serialization and subsequent decompression (du bits) of a
- *              polynomial; approximate inverse of poly_compress_du
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-static INLINE void poly_decompress_du(
-    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-#if MLKEM_DU == 10
-  poly_decompress_d10(r, a);
-#elif MLKEM_DU == 11
-  poly_decompress_d11(r, a);
-#else
-#error "Invalid value of MLKEM_DU"
-#endif
-}
-
-#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv)
-/*************************************************
- * Name:        poly_compress_dv
- *
- * Description: Compression (dv bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV],
-                                    const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r)))
-{
-#if MLKEM_DV == 4
-  poly_compress_d4(r, a);
-#elif MLKEM_DV == 5
-  poly_compress_d5(r, a);
-#else
-#error "Invalid value of MLKEM_DV"
-#endif
-}
-
-
-#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv)
-/*************************************************
- * Name:        poly_decompress_dv
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-static INLINE void poly_decompress_dv(
-    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-#if MLKEM_DV == 4
-  poly_decompress_d4(r, a);
-#elif MLKEM_DV == 5
-  poly_decompress_d5(r, a);
-#else
-#error "Invalid value of MLKEM_DV"
-#endif
-}
-
-#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du)
-/*************************************************
- * Name:        polyvec_compress_du
- *
- * Description: Compress and serialize vector of polynomials
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU)
- *              - const polyvec *a: pointer to input vector of polynomials.
- *                                  Coefficients must be unsigned canonical,
- *                                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
-                         const polyvec *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  requires(forall(k0, 0, MLKEM_K,
-         array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-  assigns(object_whole(r))
-);
-
-#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du)
-/*************************************************
- * Name:        polyvec_decompress_du
- *
- * Description: De-serialize and decompress vector of polynomials;
- *              approximate inverse of polyvec_compress_du
- *
- * Arguments:   - polyvec *r:       pointer to output vector of polynomials.
- *                Output will have coefficients normalized to [0,..,q-1].
- *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU)
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_decompress_du(polyvec *r,
-                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  assigns(object_whole(r))
-  ensures(forall(k0, 0, MLKEM_K,
-         array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-);
-
-#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes)
-/*************************************************
- * Name:        polyvec_tobytes
- *
- * Description: Serialize vector of polynomials
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (needs space for MLKEM_POLYVECBYTES)
- *              - const polyvec *a: pointer to input vector of polynomials
- *                  Each polynomial must have coefficients in [0,..,q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
-__contract__(
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  requires(memory_no_alias(r, MLKEM_POLYVECBYTES))
-  requires(forall(k0, 0, MLKEM_K,
-         array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-  assigns(object_whole(r))
-);
-
-#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes)
-/*************************************************
- * Name:        polyvec_frombytes
- *
- * Description: De-serialize vector of polynomials;
- *              inverse of polyvec_tobytes
- *
- * Arguments:   - const polyvec *a: pointer to output vector of polynomials
- *                 (of length MLKEM_POLYVECBYTES). Output will have coefficients
- *                 normalized in [0..4095].
- *              - uint8_t *r: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  requires(memory_no_alias(a, MLKEM_POLYVECBYTES))
-  assigns(object_whole(r))
-  ensures(forall(k0, 0, MLKEM_K,
-        array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
-);
-
-#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt)
-/*************************************************
- * Name:        polyvec_ntt
- *
- * Description: Apply forward NTT to all elements of a vector of polynomials.
- *
- *              The input is assumed to be in normal order and
- *              coefficient-wise bound by MLKEM_Q in absolute value.
- *
- *              The output polynomial is in bitreversed order, and
- *              coefficient-wise bound by NTT_BOUND in absolute value.
- *
- * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_ntt(polyvec *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  requires(forall(j, 0, MLKEM_K,
-  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
-  assigns(object_whole(r))
-  ensures(forall(j, 0, MLKEM_K,
-  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND)))
-);
-
-#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont)
-/*************************************************
- * Name:        polyvec_invntt_tomont
- *
- * Description: Apply inverse NTT to all elements of a vector of polynomials
- *              and multiply by Montgomery factor 2^16
- *
- *              The input is assumed to be in bitreversed order, and can
- *              have arbitrary coefficients in int16_t.
- *
- *              The output polynomial is in normal order, and
- *              coefficient-wise bound by INVNTT_BOUND in absolute value.
- *
- *
- * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_invntt_tomont(polyvec *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  assigns(object_whole(r))
-  ensures(forall(j, 0, MLKEM_K,
-  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, INVNTT_BOUND)))
-);
-
-#define polyvec_basemul_acc_montgomery \
-  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery)
-/*************************************************
- * Name:        polyvec_basemul_acc_montgomery
- *
- * Description: Multiply elements of a and b in NTT domain, accumulate into r,
- *              and multiply by 2^-16.
- *
- * Arguments: - poly *r: pointer to output polynomial
- *            - const polyvec *a: pointer to first input vector of polynomials
- *            - const polyvec *b: pointer to second input vector of polynomials
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  requires(memory_no_alias(b, sizeof(polyvec)))
-  requires(forall(k1, 0, MLKEM_K,
-    array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
-  assigns(memory_slice(r, sizeof(poly)))
-);
-
-
-#define polyvec_basemul_acc_montgomery_cached \
-  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached)
-/*************************************************
- * Name:        polyvec_basemul_acc_montgomery_cached
- *
- * Description: Scalar product of two vectors of polynomials in NTT domain,
- *              using mulcache for second operand.
- *
- *              Bounds:
- *              - Every coefficient of a is assumed to be in [0..4095]
- *              - No bounds guarantees for the coefficients in the result.
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const polyvec *a: pointer to first input polynomial vector
- *              - const polyvec *b: pointer to second input polynomial vector
- *              - const polyvec_mulcache *b_cache: pointer to mulcache
- *                  for second input polynomial vector. Can be computed
- *                  via polyvec_mulcache_compute().
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
-                                           const polyvec *b,
-                                           const polyvec_mulcache *b_cache)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  requires(memory_no_alias(b, sizeof(polyvec)))
-  requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache)))
-  requires(forall(k1, 0, MLKEM_K,
-     array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
-  assigns(memory_slice(r, sizeof(poly)))
-);
-
-#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute)
-/************************************************************
- * Name: polyvec_mulcache_compute
- *
- * Description: Computes the mulcache for a vector of polynomials in NTT domain
- *
- *              The mulcache of a degree-2 polynomial b := b0 + b1*X
- *              in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when
- *              computing products of b in Fq[X]/(X^2-zeta).
- *
- *              The mulcache of a polynomial in NTT domain -- which is
- *              a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta),
- *              for varying zeta, is the 128-tuple of mulcaches of those
- *              polynomials.
- *
- *              The mulcache of a vector of polynomials is the vector
- *              of mulcaches of its entries.
- *
- * Arguments: - x: Pointer to mulcache to be populated
- *            - a: Pointer to input polynomial vector
- ************************************************************/
-/*
- * NOTE: The default C implementation of this function populates
- * the mulcache with values in (-q,q), but this is not needed for the
- * higher level safety proofs, and thus not part of the spec.
- */
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
-__contract__(
-  requires(memory_no_alias(x, sizeof(polyvec_mulcache)))
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  assigns(object_whole(x))
-);
-
-#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce)
-/*************************************************
- * Name:        polyvec_reduce
- *
- * Description: Applies Barrett reduction to each coefficient
- *              of each element of a vector of polynomials;
- *              for details of the Barrett reduction see comments in reduce.c
- *
- * Arguments:   - polyvec *r: pointer to input/output polynomial
- **************************************************/
-/*
- * NOTE: The semantics of polyvec_reduce() is different in
- *       the reference implementation, which requires
- *       signed canonical output data. Unsigned canonical
- *       outputs are better suited to the only remaining
- *       use of poly_reduce() in the context of (de)serialization.
- */
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_reduce(polyvec *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  assigns(object_whole(r))
-  ensures(forall(k0, 0, MLKEM_K,
-    array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-);
-
-#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add)
-/*************************************************
- * Name:        polyvec_add
- *
- * Description: Add vectors of polynomials
- *
- * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be
- *              added to
- *            - const polyvec *b: pointer to second input vector of polynomials
- *
- * The coefficients of r and b must be so that the addition does
- * not overflow. Otherwise, the behaviour of this function is undefined.
- *
- * The coefficients returned in *r are in int16_t which is sufficient
- * to prove type-safety of calling units. Therefore, no stronger
- * ensures clause is required on this function.
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_add(polyvec *r, const polyvec *b)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  requires(memory_no_alias(b, sizeof(polyvec)))
-  requires(forall(j0, 0, MLKEM_K,
-          forall(k0, 0, MLKEM_N,
-            (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX)))
-  requires(forall(j1, 0, MLKEM_K,
-          forall(k1, 0, MLKEM_N,
-            (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN)))
-  assigns(object_whole(r))
-);
-
-#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont)
-/*************************************************
- * Name:        polyvec_tomont
- *
- * Description: Inplace conversion of all coefficients of a polynomial
- *              vector from normal domain to Montgomery domain
- *
- *              Bounds: Output < q in absolute value.
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_tomont(polyvec *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  assigns(memory_slice(r, sizeof(polyvec)))
-  assigns(object_whole(r))
-  ensures(forall(j, 0, MLKEM_K,
-    array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
-);
-
-#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and nonces, with output polynomials close to centered binomial distribution
- * with parameter MLKEM_ETA1.
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-/* Depending on MLKEM_K, the pointers passed to this function belong
-   to the same objects, so we cannot use memory_no_alias for r0-r3.
-
-   NOTE: Somehow it is important to use memory_no_alias() first in the
-         conjunctions defining each case.
-*/
-#if MLKEM_K == 2
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
-    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 4
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case B: r0, r1, r2, r3 consecutive */
-    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 3
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case C: r0, r1, r2 consecutive */
- (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
-  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#endif /* MLKEM_K */
-
-#if MLKEM_ETA1 == MLKEM_ETA2
-/*
- * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
- * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
- * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
- */
-#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
-#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2)
-/*************************************************
- * Name:        poly_getnoise_eta2
- *
- * Description: Sample a polynomial deterministically from a seed and a nonce,
- *              with output polynomial close to centered binomial distribution
- *              with parameter MLKEM_ETA2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1122_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and a nonces, with output polynomials close to centered binomial
- * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-__contract__(
-  requires( /* r0, r1 consecutive, r2, r3 consecutive */
- (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
-  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
-     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
-);
-#endif /* MLKEM_K == 2 */
-
-#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/reduce.h
deleted file mode 100644
index b432a4201..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/reduce.h
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef REDUCE_H
-#define REDUCE_H
-
-#include <stdint.h>
-#include "cbmc.h"
-#include "common.h"
-#include "debug.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16)
-#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic)
-#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce)
-#define fqmul MLKEM_NAMESPACE(fqmul)
-#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce)
-/* End of static namespacing */
-
-#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */
-
-/*************************************************
- * Name:        cast_uint16_to_int16
- *
- * Description: Cast uint16 value to int16
- *
- * Returns:
- *   input x in     0 .. 32767: returns value unchanged
- *   input x in 32768 .. 65535: returns (x - 65536)
- **************************************************/
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
-ALWAYS_INLINE
-static INLINE int16_t cast_uint16_to_int16(uint16_t x)
-{
-  /*
-   * PORTABILITY: This relies on uint16_t -> int16_t
-   * being implemented as the inverse of int16_t -> uint16_t,
-   * which is implementation-defined (C99 6.3.1.3 (3))
-   * CBMC (correctly) fails to prove this conversion is OK,
-   * so we have to suppress that check here
-   */
-  return (int16_t)x;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/*************************************************
- * Name:        montgomery_reduce_generic
- *
- * Description: Generic Montgomery reduction; given a 32-bit integer a, computes
- *              16-bit integer congruent to a * R^-1 mod q, where R=2^16
- *
- * Arguments:   - int32_t a: input integer to be reduced
- *
- * Returns:     integer congruent to a * R^-1 modulo q, with absolute value
- *                <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2
- *
- **************************************************/
-ALWAYS_INLINE
-static INLINE int16_t montgomery_reduce_generic(int32_t a)
-{
-  /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */
-  const uint32_t QINV = 62209; /* q^-1 mod 2^16 */
-
-  /*  Compute a*q^{-1} mod 2^16 in unsigned representatives */
-  const uint16_t a_reduced = a & UINT16_MAX;
-  const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
-
-  /* Lift to signed canonical representative mod 2^16. */
-  const int16_t t = cast_uint16_to_int16(a_inverted);
-
-  int32_t r = a - ((int32_t)t * MLKEM_Q);
-  /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */
-
-  /*
-   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
-   * implementation-defined for negative left argument. Here,
-   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
-   */
-  r = r >> 16;
-  /* Bounds: |r >> 16| <= ceil(|r| / 2^16)
-   *                   <= ceil(|a| / 2^16 + MLKEM_Q / 2)
-   *                   <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
-   *
-   * (Note that |a >> n| = ceil(|a| / 2^16) for negative a)
-   */
-
-  return (int16_t)r;
-}
-
-/*************************************************
- * Name:        montgomery_reduce
- *
- * Description: Montgomery reduction
- *
- * Arguments:   - int32_t a: input integer to be reduced
- *                  Must be smaller than 2 * 2^12 * 2^15 in absolute value.
- *
- * Returns:     integer congruent to a * R^-1 modulo q,
- *              smaller than 2 * q in absolute value.
- **************************************************/
-static INLINE int16_t montgomery_reduce(int32_t a)
-__contract__(
-  requires(a > -(2 * UINT12_LIMIT * 32768))
-  requires(a <  (2 * UINT12_LIMIT * 32768))
-  ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
-)
-{
-  int16_t res;
-  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
-
-  res = montgomery_reduce_generic(a);
-  /* Bounds:
-   * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2
-   *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
-   *        < 2 * MLKEM_Q */
-
-  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
-  return res;
-}
-
-/*************************************************
- * Name:        fqmul
- *
- * Description: Montgomery multiplication modulo q=3329
- *
- * Arguments:   - int16_t a: first factor
- *                  Can be any int16_t.
- *              - int16_t b: second factor.
- *                  Must be signed canonical (abs value <(q+1)/2)
- *
- * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and
- * smaller than q in absolute value.
- *
- **************************************************/
-static INLINE int16_t fqmul(int16_t a, int16_t b)
-__contract__(
-  requires(b > -HALF_Q)
-  requires(b < HALF_Q)
-  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
-)
-{
-  int16_t res;
-  debug_assert_abs_bound(&b, 1, HALF_Q);
-
-  res = montgomery_reduce((int32_t)a * (int32_t)b);
-  /* Bounds:
-   * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
-   *        < MLKEM_Q
-   */
-
-  debug_assert_abs_bound(&res, 1, MLKEM_Q);
-  return res;
-}
-
-/*************************************************
- * Name:        barrett_reduce
- *
- * Description: Barrett reduction; given a 16-bit integer a, computes
- *              centered representative congruent to a mod q in
- *              {-(q-1)/2,...,(q-1)/2}
- *
- * Arguments:   - int16_t a: input integer to be reduced
- *
- * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
- **************************************************/
-static INLINE int16_t barrett_reduce(int16_t a)
-__contract__(
-  ensures(return_value > -HALF_Q && return_value < HALF_Q)
-)
-{
-  /*
-   * To divide by MLKEM_Q using Barrett multiplication, the "magic number"
-   * multiplier is round_to_nearest(2**26/MLKEM_Q)
-   */
-  const int BPOWER = 26;
-  const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q;
-
-  /*
-   * Compute round_to_nearest(a/MLKEM_Q) using the multiplier
-   * above and shift by BPOWER places.
-   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
-   * implementation-defined for negative left argument. Here,
-   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
-   */
-  const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER;
-
-  /*
-   * t is in -10 .. +10, so we need 32-bit math to
-   * evaluate t * MLKEM_Q and the subsequent subtraction
-   */
-  int16_t res = (int16_t)(a - t * MLKEM_Q);
-
-  debug_assert_abs_bound(&res, 1, HALF_Q);
-  return res;
-}
-
-#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/rej_uniform.c
deleted file mode 100644
index cbbe4407f..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/rej_uniform.c
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-
-#include "arith_backend.h"
-#include "debug.h"
-#include "fips202.h"
-#include "fips202x4.h"
-#include "rej_uniform.h"
-#include "symmetric.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
-#define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
-/* End of static namespacing */
-
-static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
-                                       unsigned int offset, const uint8_t *buf,
-                                       unsigned int buflen)
-__contract__(
-  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
-  requires(memory_no_alias(r, sizeof(int16_t) * target))
-  requires(memory_no_alias(buf, buflen))
-  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * target))
-  ensures(offset <= return_value && return_value <= target)
-  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
-)
-{
-  unsigned int ctr, pos;
-  uint16_t val0, val1;
-
-  debug_assert_bound(r, offset, 0, MLKEM_Q);
-
-  ctr = offset;
-  pos = 0;
-  /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
-  while (ctr < target && pos + 3 <= buflen)
-  __loop__(
-    invariant(offset <= ctr && ctr <= target && pos <= buflen)
-    invariant(ctr > 0 ==> array_bound(r, 0, ctr, 0, MLKEM_Q)))
-  {
-    val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
-    val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
-    pos += 3;
-
-    if (val0 < MLKEM_Q)
-    {
-      r[ctr++] = val0;
-    }
-    if (ctr < target && val1 < MLKEM_Q)
-    {
-      r[ctr++] = val1;
-    }
-  }
-
-  debug_assert_bound(r, ctr, 0, MLKEM_Q);
-  return ctr;
-}
-
-#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
-/*************************************************
- * Name:        rej_uniform
- *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
- *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
- *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
- **************************************************/
-
-/*
- * NOTE: The signature differs from the Kyber reference implementation
- * in that it adds the offset and always expects the base of the target
- * buffer. This avoids shifting the buffer base in the caller, which appears
- * tricky to reason about.
- */
-static unsigned int rej_uniform(int16_t *r, unsigned int target,
-                                unsigned int offset, const uint8_t *buf,
-                                unsigned int buflen)
-__contract__(
-  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
-  requires(memory_no_alias(r, sizeof(int16_t) * target))
-  requires(memory_no_alias(buf, buflen))
-  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * target))
-  ensures(offset <= return_value && return_value <= target)
-  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
-)
-{
-  return rej_uniform_scalar(r, target, offset, buf, buflen);
-}
-#else  /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-static unsigned int rej_uniform(int16_t *r, unsigned int target,
-                                unsigned int offset, const uint8_t *buf,
-                                unsigned int buflen)
-{
-  int ret;
-
-  /* Sample from large buffer with full lane as much as possible. */
-  ret = rej_uniform_native(r + offset, target - offset, buf, buflen);
-  if (ret != -1)
-  {
-    unsigned res = offset + (unsigned)ret;
-    debug_assert_bound(r, res, 0, MLKEM_Q);
-    return res;
-  }
-
-  return rej_uniform_scalar(r, target, offset, buf, buflen);
-}
-#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-
-#ifndef MLKEM_GEN_MATRIX_NBLOCKS
-#define MLKEM_GEN_MATRIX_NBLOCKS \
-  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
-#endif
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
-{
-  /* Temporary buffers for XOF output before rejection sampling */
-  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-
-  /* Tracks the number of coefficients we have already sampled */
-  unsigned int ctr[KECCAK_WAY];
-  xof_x4_ctx statex;
-  unsigned int buflen;
-
-  shake128x4_inc_init(&statex);
-
-  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
-  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
-                MLKEM_SYMBYTES + 2);
-
-  /*
-   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   * This should generate the matrix entries with high probability.
-   */
-  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
-                       &statex);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
-  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
-  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
-  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
-
-  /*
-   * So long as not all matrix entries have been generated, squeeze
-   * one more block a time until we're done.
-   */
-  buflen = XOF_RATE;
-  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
-         ctr[3] < MLKEM_N)
-  __loop__(
-    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
-       object_whole(buf1), object_whole(buf2), object_whole(buf3))
-    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
-    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
-    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
-    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
-    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
-    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
-  {
-    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
-    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
-    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
-    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
-    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
-  }
-
-  xof_x4_release(&statex);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
-{
-  xof_ctx state;
-  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  unsigned int ctr, buflen;
-
-  shake128_inc_init(&state);
-
-  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
-
-  /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   */
-  /* This should generate the matrix entry with high probability. */
-  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
-
-  /* Squeeze + sample one more block a time until we're done */
-  buflen = XOF_RATE;
-  while (ctr < MLKEM_N)
-  __loop__(
-    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
-    invariant(ctr <= MLKEM_N)
-    invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q)))
-  {
-    xof_squeezeblocks(buf, 1, &state);
-    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
-  }
-
-  xof_release(&state);
-}
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform)
-int empty_cu_rej_uniform;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/rej_uniform.h
deleted file mode 100644
index 801287259..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/rej_uniform.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef REJ_UNIFORM_H
-#define REJ_UNIFORM_H
-
-#include <stdint.h>
-#include <stdlib.h>
-#include "cbmc.h"
-#include "common.h"
-#include "poly.h"
-
-#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
-/*************************************************
- * Name:        poly_rej_uniform_x4
- *
- * Description: Generate four polynomials using rejection sampling
- *              on (pseudo-)uniformly random bytes sampled from a seed.
- *
- * Arguments:   - poly *vec:           Pointer to an array of 4 polynomials
- *                                     to be sampled.
- *              - uint8_t *seed[4]:    Pointer to array of four pointers
- *                                     pointing to the seed buffers of size
- *                                     MLKEM_SYMBYTES + 2 each.
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
-__contract__(
-  requires(memory_no_alias(vec, sizeof(poly) * 4))
-  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
-  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(vec, sizeof(poly) * 4))
-  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
-
-#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform)
-/*************************************************
- * Name:        poly_rej_uniform
- *
- * Description: Generate polynomial using rejection sampling
- *              on (pseudo-)uniformly random bytes sampled from a seed.
- *
- * Arguments:   - poly *vec:           Pointer to polynomial to be sampled.
- *              - uint8_t *seed:       Pointer to seed buffer of size
- *                                     MLKEM_SYMBYTES + 2 each.
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
-__contract__(
-  requires(memory_no_alias(entry, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(entry, sizeof(poly)))
-  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
-
-#endif /* REJ_UNIFORM_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/sampling.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/sampling.c
new file mode 100644
index 000000000..98cbdcb74
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/sampling.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
+#include "arith_backend.h"
+#include "debug.h"
+#include "fips202.h"
+#include "fips202x4.h"
+#include "sampling.h"
+#include "symmetric.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
+#define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
+#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
+#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
+/* End of static namespacing */
+
+static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
+                                       unsigned int offset, const uint8_t *buf,
+                                       unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
+)
+{
+  unsigned int ctr, pos;
+  uint16_t val0, val1;
+
+  debug_assert_bound(r, offset, 0, MLKEM_Q);
+
+  ctr = offset;
+  pos = 0;
+  /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
+  while (ctr < target && pos + 3 <= buflen)
+  __loop__(
+    invariant(offset <= ctr && ctr <= target && pos <= buflen)
+    invariant(ctr > 0 ==> array_bound(r, 0, ctr, 0, MLKEM_Q)))
+  {
+    val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
+    val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
+    pos += 3;
+
+    if (val0 < MLKEM_Q)
+    {
+      r[ctr++] = val0;
+    }
+    if (ctr < target && val1 < MLKEM_Q)
+    {
+      r[ctr++] = val1;
+    }
+  }
+
+  debug_assert_bound(r, ctr, 0, MLKEM_Q);
+  return ctr;
+}
+
+#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
+/*************************************************
+ * Name:        rej_uniform
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int target: requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *                                     Must be <= 4096.
+ *              - unsigned int offset: number of 16-bit integers that have
+ *                                     already been sampled.
+ *                                     Must be <= target.
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes
+ *                                     Must be <= 4096.
+ *                                     Must be a multiple of 3.
+ *
+ * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
+ * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
+ *
+ * Returns the new offset of sampled 16-bit integers, at most target,
+ * and at least the initial offset.
+ * If the new offset is strictly less than target, all of the input buffer
+ * is guaranteed to have been consumed. If it is equal to target, no information
+ * is provided on how many bytes of the input buffer have been consumed.
+ **************************************************/
+
+/*
+ * NOTE: The signature differs from the Kyber reference implementation
+ * in that it adds the offset and always expects the base of the target
+ * buffer. This avoids shifting the buffer base in the caller, which appears
+ * tricky to reason about.
+ */
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
+)
+{
+  return rej_uniform_scalar(r, target, offset, buf, buflen);
+}
+#else  /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
+{
+  int ret;
+
+  /* Sample from large buffer with full lane as much as possible. */
+  ret = rej_uniform_native(r + offset, target - offset, buf, buflen);
+  if (ret != -1)
+  {
+    unsigned res = offset + (unsigned)ret;
+    debug_assert_bound(r, res, 0, MLKEM_Q);
+    return res;
+  }
+
+  return rej_uniform_scalar(r, target, offset, buf, buflen);
+}
+#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+#ifndef MLKEM_GEN_MATRIX_NBLOCKS
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
+#endif
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+{
+  /* Temporary buffers for XOF output before rejection sampling */
+  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+
+  /* Tracks the number of coefficients we have already sampled */
+  unsigned int ctr[KECCAK_WAY];
+  xof_x4_ctx statex;
+  unsigned int buflen;
+
+  shake128x4_inc_init(&statex);
+
+  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
+  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
+                MLKEM_SYMBYTES + 2);
+
+  /*
+   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   * This should generate the matrix entries with high probability.
+   */
+  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
+                       &statex);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
+  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
+  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
+  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
+
+  /*
+   * So long as not all matrix entries have been generated, squeeze
+   * one more block at a time until we're done.
+   */
+  buflen = XOF_RATE;
+  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
+         ctr[3] < MLKEM_N)
+  __loop__(
+    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
+       object_whole(buf1), object_whole(buf2), object_whole(buf3))
+    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
+    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
+    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
+    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
+    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
+    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
+  {
+    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
+    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
+    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
+    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
+    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
+  }
+
+  xof_x4_release(&statex);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+{
+  xof_ctx state;
+  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  unsigned int ctr, buflen;
+
+  shake128_inc_init(&state);
+
+  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
+
+  /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   */
+  /* This should generate the matrix entry with high probability. */
+  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
+
+  /* Squeeze + sample one more block at a time until we're done */
+  buflen = XOF_RATE;
+  while (ctr < MLKEM_N)
+  __loop__(
+    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
+    invariant(ctr <= MLKEM_N)
+    invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q)))
+  {
+    xof_squeezeblocks(buf, 1, &state);
+    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
+  }
+
+  xof_release(&state);
+}
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
+#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
+/* End of static namespacing */
+
+/*************************************************
+ * Name:        load32_littleendian
+ *
+ * Description: load 4 bytes into a 32-bit integer
+ *              in little-endian order
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x
+ **************************************************/
+static uint32_t load32_littleendian(const uint8_t x[4])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  r |= (uint32_t)x[3] << 24;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
+  {
+    unsigned j;
+    uint32_t t = load32_littleendian(buf + 4 * i);
+    uint32_t d = t & 0x55555555;
+    d += (t >> 1) & 0x55555555;
+
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
+    {
+      const int16_t a = (d >> (4 * j + 0)) & 0x3;
+      const int16_t b = (d >> (4 * j + 2)) & 0x3;
+      r->coeffs[8 * i + j] = a - b;
+    }
+  }
+}
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+/*************************************************
+ * Name:        load24_littleendian
+ *
+ * Description: load 3 bytes into a 32-bit integer
+ *              in little-endian order.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
+ **************************************************/
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 4; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 4)
+    invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
+  {
+    unsigned j;
+    const uint32_t t = load24_littleendian(buf + 3 * i);
+    uint32_t d = t & 0x00249249;
+    d += (t >> 1) & 0x00249249;
+    d += (t >> 2) & 0x00249249;
+
+    for (j = 0; j < 4; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 4 && j <= 4)
+      invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
+    {
+      const int16_t a = (d >> (6 * j + 0)) & 0x7;
+      const int16_t b = (d >> (6 * j + 3)) & 0x7;
+      r->coeffs[4 * i + j] = a - b;
+    }
+  }
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
+          3 */
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_sampling MLKEM_NAMESPACE_K(empty_cu_sampling)
+int empty_cu_sampling;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/sampling.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/sampling.h
new file mode 100644
index 000000000..cc524e0fc
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/sampling.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef SAMPLING_H
+#define SAMPLING_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include "cbmc.h"
+#include "common.h"
+#include "poly.h"
+
+#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
+/*************************************************
+ * Name:        poly_cbd2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
+/*************************************************
+ * Name:        poly_cbd3
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=3.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED || MLKEM_ETA1 == 3 */
+
+#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
+/*************************************************
+ * Name:        poly_rej_uniform_x4
+ *
+ * Description: Generate four polynomials using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
+ *
+ * Arguments:   - poly *vec:           Pointer to an array of 4 polynomials
+ *                                     to be sampled.
+ *              - uint8_t *seed[4]:    Pointer to array of four pointers
+ *                                     pointing to the seed buffers of size
+ *                                     MLKEM_SYMBYTES + 2 each.
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+__contract__(
+  requires(memory_no_alias(vec, sizeof(poly) * 4))
+  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
+  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(vec, sizeof(poly) * 4))
+  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+
+#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform)
+/*************************************************
+ * Name:        poly_rej_uniform
+ *
+ * Description: Generate polynomial using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
+ *
+ * Arguments:   - poly *entry:         Pointer to polynomial to be sampled.
+ *              - uint8_t *seed:       Pointer to seed buffer of size
+ *                                     MLKEM_SYMBYTES + 2.
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+__contract__(
+  requires(memory_no_alias(entry, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(entry, sizeof(poly)))
+  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+
+#endif /* SAMPLING_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/zetas.c
index 4ef887c62..987f0dce4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/zetas.c
@@ -10,7 +10,7 @@
 
 #include "common.h"
 #if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-#include "ntt.h"
+#include "poly.h"
 
 /*
  * Table of zeta values used in the reference NTT and inverse NTT.
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/api.h
deleted file mode 100644
index 792ecb8a4..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/api.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * Native arithmetic interface
- *
- * This header is primarily for documentation purposes.
- * It should not be included by backend implementations.
- *
- * To ensure consistency with backends, the header will be
- * included automatically after inclusion of the active
- * backend, to ensure consistency of function signatures,
- * and run sanity checks.
- */
-#ifdef MLKEM_NATIVE_ARITH_NATIVE_API_H
-#error \
-    "The arithmetic backend API `mlkem/native/api.h` "		\
-    "should not be directly included. Please include the relevant "	\
-    "structure headers directly."
-#else /* MLKEM_NATIVE_ARITH_NATIVE_API_H */
-#define MLKEM_NATIVE_ARITH_NATIVE_API_H
-
-#include <stdint.h>
-#include "poly.h"
-#include "polyvec.h"
-
-/*
- * This is the C<->native interface allowing for the drop-in of
- * native code for performance critical arithmetic components of ML-KEM.
- *
- * A _backend_ is a specific implementation of (part of) this interface.
- *
- * To add a function to a backend, define MLKEM_USE_NATIVE_XXX and
- * implement `static inline xxx(...)` in the profile header.
- *
- * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can
- * be set if there are native implementations for all of NTT, invNTT, and
- * base multiplication, and allows the native implementation to use a
- * custom order of polynomial coefficients in NTT domain -- the use of such
- * custom order is not an implementation-detail since the public matrix
- * is generated in NTT domain. In this case, a permutation function
- * poly_permute_bitrev_to_custom() needs to be provided that permutes
- * polynomials in NTT domain from bitreversed to the custom order.
- */
-
-/*
- * Those functions are meant to be trivial wrappers around the chosen native
- * implementation. The are static inline to avoid unnecessary calls.
- * The macro before each declaration controls whether a native
- * implementation is present.
- */
-
-#if defined(MLKEM_USE_NATIVE_NTT)
-/*************************************************
- * Name:        ntt_native
- *
- * Description: Computes negacyclic number-theoretic transform (NTT) of
- *              a polynomial in place.
- *
- *              The input polynomial is assumed to be in normal order.
- *              The output polynomial is in bitreversed order, or of a
- *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
- *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
- *              for more information.
- *
- * Arguments:   - poly *p: pointer to in/output polynomial
- **************************************************/
-static INLINE void ntt_native(poly *);
-#endif /* MLKEM_USE_NATIVE_NTT */
-
-#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
-/*
- * This must only be set if NTT, invNTT, basemul, mulcache, and
- * to/from byte stream conversions all have native implementations
- * that are adapted to the custom order.
- */
-#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \
-    !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) ||                  \
-    !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) ||  \
-    !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) ||                           \
-    !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
-#error \
-    "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \
-set if there are native implementations for NTT, invNTT, mulcache, basemul, \
-and to/from bytes conversions."
-#endif
-
-/*************************************************
- * Name:        poly_permute_bitrev_to_custom
- *
- * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined,
- *              convert a polynomial in NTT domain from bitreversed
- *              order to the custom order output by the native NTT.
- *
- *              This must only be defined if there is native code for
- *              all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache.
- * Arguments:   - poly *p: pointer to in/output polynomial
- *
- **************************************************/
-static INLINE void poly_permute_bitrev_to_custom(poly *);
-#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
-
-#if defined(MLKEM_USE_NATIVE_INTT)
-/*************************************************
- * Name:        intt_native
- *
- * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
- *              of a polynomial in place.
- *
- *              The input polynomial is in bitreversed order, or of a
- *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
- *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
- *              for more information.
- *              The output polynomial is assumed to be in normal order.
- *
- * Arguments:   - uint16_t *a: pointer to in/output polynomial
- **************************************************/
-static INLINE void intt_native(poly *);
-#endif /* MLKEM_USE_NATIVE_INTT */
-
-#if defined(MLKEM_USE_NATIVE_POLY_REDUCE)
-/*************************************************
- * Name:        poly_reduce_native
- *
- * Description: Applies modular reduction to all coefficients of a polynomial.
- *
- * Arguments:   - poly *r: pointer to input/output polynomial
- **************************************************/
-static INLINE void poly_reduce_native(poly *);
-#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
-
-#if defined(MLKEM_USE_NATIVE_POLY_TOMONT)
-/*************************************************
- * Name:        poly_tomont_native
- *
- * Description: Inplace conversion of all coefficients of a polynomial
- *              from normal domain to Montgomery domain
- *
- * Arguments:   - poly *r: pointer to input/output polynomial
- **************************************************/
-static INLINE void poly_tomont_native(poly *);
-#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
-
-#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE)
-/*************************************************
- * Name:        poly_mulcache_compute_native
- *
- * Description: Compute multiplication cache for a polynomial
- *              in NTT domain.
- *
- *              The purpose of the multiplication cache is to
- *              cache repeated computations required during a
- *              base multiplication of polynomials in NTT domain.
- *              The structure of the multiplication-cache is
- *              implementation defined.
- *
- * Arguments:   INPUT:
- *              - poly: const pointer to input polynomial.
- *                  This must be in NTT domain and inin bitreversed order, or of
- *                  a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
- *                  See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
- *                  for more information.
- *              OUTPUT
- *              - cache: pointer to multiplication cache
- **************************************************/
-static INLINE void poly_mulcache_compute_native(poly_mulcache *cache,
-                                                const poly *poly);
-#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
-
-#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
-/*************************************************
- * Name:        poly_mulcache_compute_native
- *
- * Description: Compute multiplication of polynomials in NTT domain.
- *
- * Arguments:   INPUT:
- *              - a: First polynomial operand.
- *                 This must be in NTT domain and inin bitreversed order, or of
- *                 a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
- *                 See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
- *                 for more information.
- *              - b: Second polynomial operand.
- *                 As for a.
- *              - b_cache: Multiplication-cache for b.
- *              OUTPUT
- *              - r: Result of the base multiplication. This is again
- *                   in NTT domain, and of the same order as a and b.
- **************************************************/
-static INLINE void polyvec_basemul_acc_montgomery_cached_native(
-    poly *r, const polyvec *a, const polyvec *b,
-    const polyvec_mulcache *b_cache);
-#endif
-
-#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
-/*************************************************
- * Name:        poly_tobytes_native
- *
- * Description: Serialization of a polynomial.
- *              Signed coefficients are converted to
- *              unsigned form before serialization.
- *
- * Arguments:   INPUT:
- *              - a: const pointer to input polynomial,
- *                with each coefficient in the range -Q+1 .. Q-1
- *              OUTPUT
- *              - r: pointer to output byte array
- *                   (of MLKEM_POLYBYTES bytes)
- **************************************************/
-static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
-                                       const poly *a);
-#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
-
-#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
-/*************************************************
- * Name:        poly_frombytes_native
- *
- * Description: Serialization of a polynomial.
- *              Signed coefficients are converted to
- *              unsigned form before serialization.
- *
- * Arguments:   INPUT:
- *              - r: pointer to output polynomial in NTT domain
- *              OUTPUT
- *              - a: const pointer to input byte aray
- *                   (of MLKEM_POLYBYTES bytes)
- **************************************************/
-static INLINE void poly_frombytes_native(poly *a,
-                                         const uint8_t r[MLKEM_POLYBYTES]);
-#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-
-#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
-/*************************************************
- * Name:        rej_uniform_native
- *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
- *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int len:    requested number of 16-bit integers
- *                                     (uniform mod q).
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes.
- *
- * Return -1 if the native implementation does not support the input lengths.
- * Otherwise, returns non-negative number of sampled 16-bit integers (at most
- * len).
- **************************************************/
-static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
-                                     const uint8_t *buf, unsigned int buflen);
-#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-
-#endif /* MLKEM_NATIVE_ARITH_NATIVE_API_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/arith_backend.h
index 0543b1bd1..ade31cda1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/arith_backend.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/arith_backend.h
@@ -17,7 +17,7 @@
  * Keep this _after_ the inclusion of the backend; otherwise,
  * the sanity checks won't have an effect. */
 #if defined(MLKEM_NATIVE_CHECK_APIS)
-#include "api.h"
+#include "native/api.h"
 #endif
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.c
deleted file mode 100644
index 1e6b7c5d1..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
-
-#include <stdint.h>
-#include "cbd.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
-#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
-/* End of static namespacing */
-
-/*************************************************
- * Name:        load32_littleendian
- *
- * Description: load 4 bytes into a 32-bit integer
- *              in little-endian order
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x
- **************************************************/
-static uint32_t load32_littleendian(const uint8_t x[4])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  r |= (uint32_t)x[3] << 24;
-  return r;
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
-  {
-    unsigned j;
-    uint32_t t = load32_littleendian(buf + 4 * i);
-    uint32_t d = t & 0x55555555;
-    d += (t >> 1) & 0x55555555;
-
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
-    {
-      const int16_t a = (d >> (4 * j + 0)) & 0x3;
-      const int16_t b = (d >> (4 * j + 2)) & 0x3;
-      r->coeffs[8 * i + j] = a - b;
-    }
-  }
-}
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
-/*************************************************
- * Name:        load24_littleendian
- *
- * Description: load 3 bytes into a 32-bit integer
- *              in little-endian order.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
- **************************************************/
-static uint32_t load24_littleendian(const uint8_t x[3])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  return r;
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 4; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 4)
-    invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
-  {
-    unsigned j;
-    const uint32_t t = load24_littleendian(buf + 3 * i);
-    uint32_t d = t & 0x00249249;
-    d += (t >> 1) & 0x00249249;
-    d += (t >> 2) & 0x00249249;
-
-    for (j = 0; j < 4; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 4 && j <= 4)
-      invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
-    {
-      const int16_t a = (d >> (6 * j + 0)) & 0x7;
-      const int16_t b = (d >> (6 * j + 3)) & 0x7;
-      r->coeffs[4 * i + j] = a - b;
-    }
-  }
-}
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
-          3 */
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd)
-int empty_cu_cbd;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.h
deleted file mode 100644
index 54c1f5b90..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/cbd.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef CBD_H
-#define CBD_H
-
-#include <stdint.h>
-#include "common.h"
-#include "poly.h"
-
-#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
-/*************************************************
- * Name:        poly_cbd2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
-#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
-/*************************************************
- * Name:        poly_cbd3
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=3.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */
-
-#endif /* CBD_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/common.h
index 4f326333e..62ed53ab1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/common.h
@@ -15,12 +15,19 @@
 #include "sys.h"
 
 /* Include backend metadata */
-#if defined(MLKEM_USE_NATIVE)
-#if defined(MLKEM_NATIVE_ARITH_BACKEND)
-#include MLKEM_NATIVE_ARITH_BACKEND
+#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH)
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_FILE)
+#include MLKEM_NATIVE_ARITH_BACKEND_FILE
+#else
+#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_ARITH is set, but MLKEM_NATIVE_ARITH_BACKEND_FILE is not.
+#endif
 #endif
-#if defined(MLKEM_NATIVE_FIPS202_BACKEND)
-#include MLKEM_NATIVE_FIPS202_BACKEND
+
+#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202)
+#if defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE)
+#include MLKEM_NATIVE_FIPS202_BACKEND_FILE
+#else
+#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, but MLKEM_NATIVE_FIPS202_BACKEND_FILE is not.
 #endif
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/compress.c
new file mode 100644
index 000000000..a03fe0ac4
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/compress.c
@@ -0,0 +1,395 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
+#include <stdint.h>
+#include <string.h>
+#include "arith_backend.h"
+#include "cbmc.h"
+#include "compress.h"
+#include "debug.h"
+#include "verify.h"
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 16)))
+    {
+      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
+    }
+
+    r[i * 4] = t[0] | (t[1] << 4);
+    r[i * 4 + 1] = t[2] | (t[3] << 4);
+    r[i * 4 + 2] = t[4] | (t[5] << 4);
+    r[i * 4 + 3] = t[6] | (t[7] << 4);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(invariant(j <= MLKEM_N / 4))
+  {
+    unsigned k;
+    uint16_t t[4];
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k <= 4)
+      invariant(forall(r, 0, k, t[r] < (1u << 10))))
+    {
+      t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 10-bit in size.
+     */
+    r[5 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
+    r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
+    r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
+    r[5 * j + 4] = (t[3] >> 2);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
+  {
+    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
+    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 4)
+    invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[4];
+    uint8_t const *base = &a[5 * j];
+
+    t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6));
+    t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4));
+    t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2));
+
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k <= 4)
+      invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 32)))
+    {
+      t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
+    }
+
+    /*
+     * Explicitly truncate to avoid warning about
+     * implicit truncation in CBMC, and use array indexing into
+     * r rather than pointer-arithmetic to simplify verification
+     */
+    r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
+    r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
+    r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
+    r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
+    r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(invariant(j <= MLKEM_N / 8))
+  {
+    unsigned k;
+    uint16_t t[8];
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+    {
+      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+    }
+
+    /*
+     * Make all implicit truncation explicit. No data is being
+     * truncated for the LHS's since each t[i] is 11-bit in size.
+     */
+    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
+    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
+    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
+    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
+    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
+    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
+    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
+    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
+    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
+    r[11 * j + 10] = (t[7] >> 3);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
+  {
+    unsigned j;
+    uint8_t t[8];
+    const unsigned offset = i * 5;
+    /*
+     * Explicitly truncate to avoid warning about
+     * implicit truncation in CBMC and unwind loop for ease
+     * of proof.
+     */
+
+    /*
+     * Decompress 5 8-bit bytes (so 40 bits) into
+     * 8 5-bit values stored in t[]
+     */
+    t[0] = 0x1F & (a[offset + 0] >> 0);
+    t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3));
+    t[2] = 0x1F & (a[offset + 1] >> 2);
+    t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1));
+    t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4));
+    t[5] = 0x1F & (a[offset + 3] >> 1);
+    t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2));
+    t[7] = 0x1F & (a[offset + 4] >> 3);
+
+    /* and copy to the correct slice in r[] */
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(j <= 8 && i <= MLKEM_N / 8)
+      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[8];
+    uint8_t const *base = &a[11 * j];
+    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
+    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
+                    ((uint16_t)base[4] << 10));
+    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
+    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
+    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
+                    ((uint16_t)base[8] << 9));
+    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
+    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
+
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED || MLKEM_K == 4 */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(invariant(i <= MLKEM_N / 2))
+  {
+    const uint16_t t0 = a->coeffs[2 * i];
+    const uint16_t t1 = a->coeffs[2 * i + 1];
+    /*
+     * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
+     * significant data, so these can be packed into 24 bits or exactly
+     * 3 bytes, as follows.
+     */
+
+    /* Least significant bits 0 - 7 of t0. */
+    r[3 * i + 0] = t0 & 0xFF;
+
+    /*
+     * Most significant bits 8 - 11 of t0 become the least significant
+     * nibble of the second byte. The least significant 4 bits
+     * of t1 become the upper nibble of the second byte.
+     */
+    r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+
+    /* Bits 4 - 11 of t1 become the third byte. */
+    r[3 * i + 2] = t1 >> 4;
+  }
+}
+#else  /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  poly_tobytes_native(r, a->coeffs);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
+  {
+    const uint8_t t0 = a[3 * i + 0];
+    const uint8_t t1 = a[3 * i + 1];
+    const uint8_t t2 = a[3 * i + 2];
+    r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
+    r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
+  }
+
+  /* Note that the coefficients are not canonical */
+  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
+}
+#else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+  poly_frombytes_native(r->coeffs, a);
+}
+#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+{
+  unsigned i;
+#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
+#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
+#endif
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
+  {
+    unsigned j;
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <  MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
+    {
+      /* Prevent the compiler from recognizing this as a bit selection */
+      uint8_t mask = value_barrier_u8(1u << j);
+      r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
+    }
+  }
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    msg[i] = 0;
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8))
+    {
+      uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
+      msg[i] |= t << j;
+    }
+  }
+}
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_compress MLKEM_NAMESPACE_K(empty_cu_compress)
+int empty_cu_compress;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/compress.h
new file mode 100644
index 000000000..409dbe519
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/compress.h
@@ -0,0 +1,495 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef COMPRESS_H
+#define COMPRESS_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "cbmc.h"
+#include "common.h"
+#include "debug.h"
+#include "poly.h"
+#include "verify.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1)
+#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4)
+#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5)
+#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10)
+#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11)
+#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4)
+#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5)
+#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10)
+#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11)
+/* End of static namespacing */
+
+/************************************************************
+ * Name: scalar_compress_d1
+ *
+ * Description: Computes round(u * 2 / q) % 2
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 1.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d1(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 2)
+  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
+{
+  uint32_t d0 = u << 1;
+  d0 *= 645083;
+  d0 += 1u << 30;
+  d0 >>= 31;
+  return d0;
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_compress_d4
+ *
+ * Description: Computes round(u * 16 / q) % 16
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d4(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 16)
+  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
+{
+  uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */
+  return (d0 + (1u << 27)) >> 28;      /* round(d0/2^28) */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d4
+ *
+ * Description: Computes round(u * q / 16)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 16
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d4(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 16)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 8) / 16; }
+
+/************************************************************
+ * Name: scalar_compress_d5
+ *
+ * Description: Computes round(u * 32 / q) % 32
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d5(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 32)
+  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
+{
+  uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */
+  return (d0 + (1u << 26)) >> 27;      /* round(d0/2^27) */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d5
+ *
+ * Description: Computes round(u * q / 32)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 32
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d5(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 32)
+  ensures(return_value <= MLKEM_Q - 1)
+) { return ((u * MLKEM_Q) + 16) / 32; }
+
+/************************************************************
+ * Name: scalar_compress_d10
+ *
+ * Description: Computes round(u * 2**10 / q) % 2**10
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d10(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 10))
+  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
+{
+  uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^33 / MLKEM_Q) */
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33; /* round(d0/2^33) */
+  return (d0 & 0x3FF);                    /* reduce modulo 2^10 */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d10
+ *
+ * Description: Computes round(u * q / 1024)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 1024
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d10(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 1024)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 512) / 1024; }
+
+/************************************************************
+ * Name: scalar_compress_d11
+ *
+ * Description: Computes round(u * 2**11 / q) % 2**11
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d11(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 11))
+  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
+{
+  uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
+  return (d0 & 0x7FF);
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d11
+ *
+ * Description: Computes round(u * q / 2048)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 2048
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d11(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 2048)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 1024) / 2048; }
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
+    (MLKEM_K == 2 || MLKEM_K == 3)
+#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
+/*************************************************
+ * Name:        poly_compress_d4
+ *
+ * Description: Compression (4 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
+
+#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
+/*************************************************
+ * Name:        poly_compress_d10
+ *
+ * Description: Compression (10 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
+
+#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
+/*************************************************
+ * Name:        poly_decompress_d4
+ *
+ * Description: De-serialization and subsequent decompression (4 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d4
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
+
+#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
+/*************************************************
+ * Name:        poly_decompress_d10
+ *
+ * Description: De-serialization and subsequent decompression (10 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d10
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
+/*************************************************
+ * Name:        poly_compress_d5
+ *
+ * Description: Compression (5 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
+
+#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
+/*************************************************
+ * Name:        poly_compress_d11
+ *
+ * Description: Compression (11 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
+
+#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
+/*************************************************
+ * Name:        poly_decompress_d5
+ *
+ * Description: De-serialization and subsequent decompression (5 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d5
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+
+#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
+/*************************************************
+ * Name:        poly_decompress_d11
+ *
+ * Description: De-serialization and subsequent decompression (11 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d11
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
+        */
+
+#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
+/*************************************************
+ * Name:        poly_tobytes
+ *
+ * Description: Serialization of a polynomial.
+ *              Signed coefficients are converted to
+ *              unsigned form before serialization.
+ *
+ * Arguments:   INPUT:
+ *              - a: const pointer to input polynomial,
+ *                with each coefficient in the range [0,1,..,Q-1]
+ *              OUTPUT
+ *              - r: pointer to output byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYBYTES))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r))
+);
+
+
+#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes)
+/*************************************************
+ * Name:        poly_frombytes
+ *
+ * Description: De-serialization of a polynomial.
+ *
+ * Arguments:   INPUT
+ *              - a: pointer to input byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ *              OUTPUT
+ *              - r: pointer to output polynomial, with
+ *                   each coefficient unsigned and in the range
+ *                   0 .. 4095
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))
+);
+
+
+#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg)
+/*************************************************
+ * Name:        poly_frommsg
+ *
+ * Description: Convert 32-byte message to polynomial
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *msg: pointer to input message
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+);
+
+#define poly_tomsg MLKEM_NAMESPACE(poly_tomsg)
+/*************************************************
+ * Name:        poly_tomsg
+ *
+ * Description: Convert polynomial to 32-byte message
+ *
+ * Arguments:   - uint8_t *msg: pointer to output message
+ *              - const poly *r: pointer to input polynomial
+ *                Coefficients must be unsigned canonical
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r)
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(msg))
+);
+
+#endif /* COMPRESS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/config.h
index fa89370ce..e975ede95 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/config.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/config.h
@@ -122,46 +122,87 @@
 /* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 /******************************************************************************
- * Name:        MLKEM_USE_NATIVE
+ * Name:        MLKEM_USE_NATIVE_BACKEND_ARITH
  *
- * Description: Determines whether a native backend should
- *              be used, if available.
+ * Description: Determines whether a native arithmetic backend should be used.
+ *
+ *              The arithmetic backend covers performance critical functions
+ *              such as the number-theoretic transform (NTT).
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the arithmetic backend to be used is
+ *              determined by MLKEM_NATIVE_ARITH_BACKEND_FILE: If the latter
+ *              is unset, the default backend for the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
  *
  *              This can also be set using CFLAGS.
  *
  *****************************************************************************/
-#if !defined(MLKEM_USE_NATIVE)
-/* #define MLKEM_USE_NATIVE */
+#if !defined(MLKEM_USE_NATIVE_BACKEND_ARITH)
+/* #define MLKEM_USE_NATIVE_BACKEND_ARITH */
 #endif
 
 /******************************************************************************
- * Name:        MLKEM_NATIVE_ARITH_BACKEND
+ * Name:        MLKEM_NATIVE_ARITH_BACKEND_FILE
  *
  * Description: The arithmetic backend to use.
  *
- *              This must be the filename of an arithmetic backend.
- *              See the existing backends for examples.
+ *              If MLKEM_USE_NATIVE_BACKEND_ARITH is unset, this option
+ *              is ignored.
+ *
+ *              If MLKEM_USE_NATIVE_BACKEND_ARITH is set, this option must
+ *              either be undefined or the filename of an arithmetic backend.
+ *              If unset, the default backend will be used.
  *
  *              This can be set using CFLAGS.
  *
  *****************************************************************************/
-#if defined(MLKEM_USE_NATIVE) && !defined(MLKEM_NATIVE_ARITH_BACKEND)
-#define MLKEM_NATIVE_ARITH_BACKEND "default.h"
-#endif /* MLKEM_NATIVE_ARITH_BACKEND */
+#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) && \
+    !defined(MLKEM_NATIVE_ARITH_BACKEND_FILE)
+#define MLKEM_NATIVE_ARITH_BACKEND_FILE "native/default.h"
+#endif
 
 /******************************************************************************
- * Name:        MLKEM_NATIVE_FIPS202_BACKEND
+ * Name:        MLKEM_USE_NATIVE_BACKEND_FIPS202
+ *
+ * Description: Determines whether a native FIPS202 backend should be used.
+ *
+ *              The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is
+ *              the performance bottleneck of SHA3 and SHAKE.
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the FIPS202 backend to be used is
+ *              determined by MLKEM_NATIVE_FIPS202_BACKEND_FILE: If the latter
+ *              is unset, the default backend for the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if !defined(MLKEM_USE_NATIVE_BACKEND_FIPS202)
+/* #define MLKEM_USE_NATIVE_BACKEND_FIPS202 */
+#endif
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_FIPS202_BACKEND_FILE
  *
  * Description: The FIPS-202 backend to use.
  *
- *              This must be the filename of an FIPS-202 backend.
+ *              If MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, this option must
+ *              either be undefined or the filename of a FIPS202 backend.
+ *              If unset, the default backend will be used.
  *
  *              This can be set using CFLAGS.
  *
  *****************************************************************************/
-#if defined(MLKEM_USE_NATIVE_FIPS202) && !defined(MLKEM_NATIVE_FIPS202_BACKEND)
-#define MLKEM_NATIVE_FIPS202_BACKEND "native/default.h"
-#endif /* MLKEM_NATIVE_FIPS202_BACKEND */
+#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) && \
+    !defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE)
+#define MLKEM_NATIVE_FIPS202_BACKEND_FILE "fips202/native/default.h"
+#endif
 
 /*************************  Config internals  ********************************/
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/default.h
deleted file mode 100644
index d1e41c52e..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/default.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H
-#define MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H
-
-/*
- * Default arithmetic backend
- */
-#include "sys.h"
-
-#ifdef SYS_AARCH64
-/*
- * For AArch64, we currently we have one clean and one opt profile.
- * We default to the opt profile.
- *
- * In the future, this may branch further depending on the microarchitecture.
- */
-#include "aarch64/opt.h"
-#endif /* SYS_AARCH64 */
-
-#ifdef SYS_X86_64_AVX2
-/*
- * For now, there's only one x86_64 profile, based on
- * the AVX2 code from the Kyber repository.
- * https://github.com/pq-crystals/kyber
- */
-#include "x86_64/default.h"
-#endif /* SYS_X86_64 */
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.c
index 0cfcc3e9e..318d0fc77 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.c
@@ -9,11 +9,10 @@
 #include "fips202.h"
 #include "fips202x4.h"
 #include "indcpa.h"
-#include "ntt.h"
 #include "poly.h"
-#include "polyvec.h"
+#include "poly_k.h"
 #include "randombytes.h"
-#include "rej_uniform.h"
+#include "sampling.h"
 #include "symmetric.h"
 
 #include "arith_backend.h"
@@ -149,14 +148,14 @@ static void unpack_ciphertext(polyvec *b, poly *v,
 #define poly_permute_bitrev_to_custom \
   MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom)
 
-static INLINE void poly_permute_bitrev_to_custom(poly *data)
+static INLINE void poly_permute_bitrev_to_custom(int16_t data[MLKEM_N])
 __contract__(
   /* We don't specify that this should be a permutation, but only
    * that it does not change the bound established at the end of gen_matrix. */
-  requires(memory_no_alias(data, sizeof(poly)))
-  requires(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N))
+  requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))
   assigns(memory_slice(data, sizeof(poly)))
-  ensures(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
+  ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
 #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
 
 /* Not static for benchmarking */
@@ -245,7 +244,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
   {
     for (j = 0; j < MLKEM_K; j++)
     {
-      poly_permute_bitrev_to_custom(&a[i].vec[j]);
+      poly_permute_bitrev_to_custom(a[i].vec[j].coeffs);
     }
   }
 }
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.h
index 2c4fda3c4..b4d5985bf 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/indcpa.h
@@ -8,7 +8,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "polyvec.h"
+#include "poly_k.h"
 
 #define gen_matrix MLKEM_NAMESPACE_K(gen_matrix)
 /*************************************************
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/native/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/native/api.h
new file mode 100644
index 000000000..0704f9dcd
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/native/api.h
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * Native arithmetic interface
+ *
+ * This header is primarily for documentation purposes.
+ * It should not be included by backend implementations.
+ *
+ * Instead, it is included automatically after the active
+ * backend has been included, so that the backend's function
+ * signatures are checked against the declarations here and
+ * the sanity checks in this header are run.
+ */
+#ifdef MLKEM_NATIVE_ARITH_NATIVE_API_H
+#error \
+    "The arithmetic backend API `mlkem/native/api.h` "		\
+    "should not be directly included. Please include the relevant "	\
+    "structure headers directly."
+#else /* MLKEM_NATIVE_ARITH_NATIVE_API_H */
+#define MLKEM_NATIVE_ARITH_NATIVE_API_H
+
+#include <stdint.h>
+#include "../common.h"
+
+/*
+ * This is the C<->native interface allowing for the drop-in of
+ * native code for performance critical arithmetic components of ML-KEM.
+ *
+ * A _backend_ is a specific implementation of (part of) this interface.
+ *
+ * To add a function to a backend, define MLKEM_USE_NATIVE_XXX and
+ * implement `static inline xxx(...)` in the profile header.
+ *
+ * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can
+ * be set if there are native implementations for all of NTT, invNTT, and
+ * base multiplication, and allows the native implementation to use a
+ * custom order of polynomial coefficients in NTT domain -- the use of such
+ * custom order is not an implementation-detail since the public matrix
+ * is generated in NTT domain. In this case, a permutation function
+ * poly_permute_bitrev_to_custom() needs to be provided that permutes
+ * polynomials in NTT domain from bitreversed to the custom order.
+ */
+
+/*
+ * Those functions are meant to be trivial wrappers around the chosen native
+ * implementation. They are static inline to avoid unnecessary calls.
+ * The macro before each declaration controls whether a native
+ * implementation is present.
+ */
+
+#if defined(MLKEM_USE_NATIVE_NTT)
+/*************************************************
+ * Name:        ntt_native
+ *
+ * Description: Computes negacyclic number-theoretic transform (NTT) of
+ *              a polynomial in place.
+ *
+ *              The input polynomial is assumed to be in normal order.
+ *              The output polynomial is in bitreversed order, or of a
+ *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *              for more information.
+ *
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
+ **************************************************/
+static INLINE void ntt_native(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_NTT */
+
+#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
+/*
+ * This must only be set if NTT, invNTT, basemul, mulcache, and
+ * to/from byte stream conversions all have native implementations
+ * that are adapted to the custom order.
+ */
+#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \
+    !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) ||                  \
+    !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) ||  \
+    !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) ||                           \
+    !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+#error \
+    "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \
+set if there are native implementations for NTT, invNTT, mulcache, basemul, \
+and to/from bytes conversions."
+#endif
+
+/*************************************************
+ * Name:        poly_permute_bitrev_to_custom
+ *
+ * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined,
+ *              convert a polynomial in NTT domain from bitreversed
+ *              order to the custom order output by the native NTT.
+ *
+ *              This requires native code for all of (a) NTT, (b) invNTT,
+ *              (c) basemul, (d) mulcache, (e) to/from bytes conversion.
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
+ *
+ **************************************************/
+static INLINE void poly_permute_bitrev_to_custom(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
+
+#if defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        intt_native
+ *
+ * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+ *              of a polynomial in place.
+ *
+ *              The input polynomial is in bitreversed order, or of a
+ *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *              for more information.
+ *              The output polynomial is in normal order.
+ *
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
+ **************************************************/
+static INLINE void intt_native(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_INTT */
+
+#if defined(MLKEM_USE_NATIVE_POLY_REDUCE)
+/*************************************************
+ * Name:        poly_reduce_native
+ *
+ * Description: Applies modular reduction to all coefficients of a polynomial.
+ *
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to input/output polynomial
+ **************************************************/
+static INLINE void poly_reduce_native(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
+
+#if defined(MLKEM_USE_NATIVE_POLY_TOMONT)
+/*************************************************
+ * Name:        poly_tomont_native
+ *
+ * Description: Inplace conversion of all coefficients of a polynomial
+ *              from normal domain to Montgomery domain
+ *
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to input/output polynomial
+ **************************************************/
+static INLINE void poly_tomont_native(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
+
+#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE)
+/*************************************************
+ * Name:        poly_mulcache_compute_native
+ *
+ * Description: Compute multiplication cache for a polynomial
+ *              in NTT domain.
+ *
+ *              The purpose of the multiplication cache is to
+ *              cache repeated computations required during a
+ *              base multiplication of polynomials in NTT domain.
+ *              The structure of the multiplication-cache is
+ *              implementation defined.
+ *
+ * Arguments:   INPUT:
+ *              - poly: const pointer to input polynomial.
+ *                  This must be in NTT domain and in bitreversed order, or of
+ *                  a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *                  See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *                  for more information.
+ *              OUTPUT
+ *              - cache: pointer to multiplication cache
+ **************************************************/
+static INLINE void poly_mulcache_compute_native(int16_t cache[MLKEM_N / 2],
+                                                const int16_t poly[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
+#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery_cached_native
+ *
+ * Description: Compute multiplication of polynomials in NTT domain.
+ *
+ * Arguments:   INPUT:
+ *              - a: First polynomial operand.
+ *                 This must be in NTT domain and in bitreversed order, or of
+ *                 a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *                 See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *                 for more information.
+ *              - b: Second polynomial operand.
+ *                 As for a.
+ *              - b_cache: Multiplication-cache for b.
+ *              OUTPUT
+ *              - r: Result of the base multiplication. This is again
+ *                   in NTT domain, and of the same order as a and b.
+ **************************************************/
+static INLINE void polyvec_basemul_acc_montgomery_cached_native(
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]);
+#endif
+
+#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
+/*************************************************
+ * Name:        poly_tobytes_native
+ *
+ * Description: Serialization of a polynomial.
+ *              Signed coefficients are converted to
+ *              unsigned form before serialization.
+ *
+ * Arguments:   INPUT:
+ *              - a: const pointer to input polynomial,
+ *                with each coefficient in the range -Q+1 .. Q-1
+ *              OUTPUT
+ *              - r: pointer to output byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ **************************************************/
+static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
+                                       const int16_t a[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+
+#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+/*************************************************
+ * Name:        poly_frombytes_native
+ *
+ * Description: Deserialization of a polynomial.
+ *              Reads MLKEM_POLYBYTES bytes from a byte array and
+ *              converts them into MLKEM_N polynomial coefficients.
+ *
+ * Arguments:   INPUT:
+ *              - r: const pointer to input byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ *              OUTPUT
+ *              - a: pointer to output polynomial in NTT domain
+ **************************************************/
+static INLINE void poly_frombytes_native(int16_t a[MLKEM_N],
+                                         const uint8_t r[MLKEM_POLYBYTES]);
+#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+
+#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
+/*************************************************
+ * Name:        rej_uniform_native
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int len:    requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes.
+ *
+ * Return -1 if the native implementation does not support the input lengths.
+ * Otherwise, returns non-negative number of sampled 16-bit integers (at most
+ * len).
+ **************************************************/
+static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
+                                     const uint8_t *buf, unsigned int buflen);
+#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+#endif /* MLKEM_NATIVE_ARITH_NATIVE_API_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/native/default.h
new file mode 100644
index 000000000..f9fe4310a
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/native/default.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H
+#define MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H
+
+/*
+ * Default arithmetic backend
+ */
+#include "../sys.h"
+
+#ifdef SYS_AARCH64
+/*
+ * For AArch64, we currently have one clean and one opt profile.
+ * We default to the opt profile.
+ *
+ * In the future, this may branch further depending on the microarchitecture.
+ */
+#include "aarch64/opt.h"
+#endif /* SYS_AARCH64 */
+
+#ifdef SYS_X86_64_AVX2
+/*
+ * For now, there's only one x86_64 profile, based on
+ * the AVX2 code from the Kyber repository.
+ * https://github.com/pq-crystals/kyber
+ */
+#include "x86_64/default.h"
+#endif /* SYS_X86_64_AVX2 */
+
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.c
deleted file mode 100644
index 3651c8da9..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.c
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-
-#include <stdint.h>
-#include "arith_backend.h"
-#include "debug.h"
-#include "ntt.h"
-#include "reduce.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block)
-#define ntt_layer MLKEM_NAMESPACE(ntt_layer)
-#define invntt_layer MLKEM_NAMESPACE(invntt_layer)
-/* End of static namespacing */
-
-#if !defined(MLKEM_USE_NATIVE_NTT)
-/*
- * Computes a block CT butterflies with a fixed twiddle factor,
- * using Montgomery multiplication.
- * Parameters:
- * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
- * - root: Twiddle factor to use for the butterfly. This must be in
- *         Montgomery form and signed canonical.
- * - start: Offset to the beginning of the butterfly block
- * - len: Index difference between coefficients subject to a butterfly
- * - bound: Ghost variable describing coefficient bound: Prior to `start`,
- *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
- *          they must be bound by `bound`.
- * When this function returns, output coefficients in the index range
- * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
- * Example:
- * - start=8, len=4
- *   This would compute the following four butterflies
- *          8     --    12
- *             9    --     13
- *                10   --     14
- *                   11   --     15
- * - start=4, len=2
- *   This would compute the following two butterflies
- *          4 -- 6
- *             5 -- 7
- */
-static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
-                                unsigned start, unsigned len, int bound)
-__contract__(
-  requires(start < MLKEM_N)
-  requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
-  requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
-  requires(-HALF_Q < zeta && zeta < HALF_Q)
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(array_abs_bound(r, 0, start, bound + MLKEM_Q))
-  requires(array_abs_bound(r, start, MLKEM_N, bound))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q))
-  ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
-{
-  /* `bound` is a ghost variable only needed in the CBMC specification */
-  unsigned j;
-  ((void)bound);
-  for (j = start; j < start + len; j++)
-  __loop__(
-    invariant(start <= j && j <= start + len)
-    /*
-     * Coefficients are updated in strided pairs, so the bounds for the
-     * intermediate states alternate twice between the old and new bound
-     */
-    invariant(array_abs_bound(r, 0,           j,           bound + MLKEM_Q))
-    invariant(array_abs_bound(r, j,           start + len, bound))
-    invariant(array_abs_bound(r, start + len, j + len,     bound + MLKEM_Q))
-    invariant(array_abs_bound(r, j + len,     MLKEM_N,     bound)))
-  {
-    int16_t t;
-    t = fqmul(r[j + len], zeta);
-    r[j + len] = r[j] - t;
-    r[j] = r[j] + t;
-  }
-}
-
-/*
- *Compute one layer of forward NTT
- * Parameters:
- * - r: Pointer to base of polynomial
- * - len: Stride of butterflies in this layer.
- * - layer: Ghost variable indicating which layer is being applied.
- *          Must match `len` via `len == MLKEM_N >> layer`.
- * Note: `len` could be dropped and computed in the function, but
- *   we are following the structure of the reference NTT from the
- *   official Kyber implementation here, merely adding `layer` as
- *   a ghost variable for the specifications.
- */
-static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
-__contract__(
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
-  requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
-{
-  unsigned start, k;
-  /* `layer` is a ghost variable only needed in the CBMC specification */
-  ((void)layer);
-  /* Twiddle factors for layer n start at index 2^(layer-1) */
-  k = MLKEM_N / (2 * len);
-  for (start = 0; start < MLKEM_N; start += 2 * len)
-  __loop__(
-    invariant(start < MLKEM_N + 2 * len)
-    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
-    invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
-    invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
-  {
-    int16_t zeta = zetas[k++];
-    ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
-  }
-}
-
-/*
- * Compute full forward NTT
- * NOTE: This particular implementation satisfies a much tighter
- * bound on the output coefficients (5*q) than the contractual one (8*q),
- * but this is not needed in the calling code. Should we change the
- * base multiplication strategy to require smaller NTT output bounds,
- * the proof may need strengthening.
- */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *p)
-{
-  unsigned len, layer;
-  int16_t *r;
-  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
-  r = p->coeffs;
-
-  for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
-  __loop__(
-    invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer))
-    invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)))
-  {
-    ntt_layer(r, len, layer);
-  }
-
-  /* Check the stronger bound */
-  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
-}
-#else  /* MLKEM_USE_NATIVE_NTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *p)
-{
-  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
-  ntt_native(p);
-  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
-}
-#endif /* MLKEM_USE_NATIVE_NTT */
-
-#if !defined(MLKEM_USE_NATIVE_INTT)
-
-/* Compute one layer of inverse NTT */
-static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
-__contract__(
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
-  requires(len == (1 << (8 - layer)))
-  requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-{
-  unsigned start, k;
-  /* `layer` is a ghost variable used only in the specification */
-  ((void)layer);
-  k = MLKEM_N / len - 1;
-  for (start = 0; start < MLKEM_N; start += 2 * len)
-  __loop__(
-    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-    invariant(start <= MLKEM_N && k <= 127)
-    /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
-    invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
-  {
-    unsigned j;
-    int16_t zeta = zetas[k--];
-    for (j = start; j < start + len; j++)
-    __loop__(
-      invariant(start <= j && j <= start + len)
-      invariant(start <= MLKEM_N && k <= 127)
-      invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-    {
-      int16_t t = r[j];
-      r[j] = barrett_reduce(t + r[j + len]);
-      r[j + len] = r[j + len] - t;
-      r[j + len] = fqmul(r[j + len], zeta);
-    }
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *p)
-{
-  /*
-   * Scale input polynomial to account for Montgomery factor
-   * and NTT twist. This also brings coefficients down to
-   * absolute value < MLKEM_Q.
-   */
-  unsigned j, len, layer;
-  const int16_t f = 1441;
-  int16_t *r = p->coeffs;
-
-  for (j = 0; j < MLKEM_N; j++)
-  __loop__(
-    invariant(j <= MLKEM_N)
-    invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
-  {
-    r[j] = fqmul(r[j], f);
-  }
-
-  /* Run the invNTT layers */
-  for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
-  __loop__(
-    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
-    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-  {
-    invntt_layer(p->coeffs, len, layer);
-  }
-
-  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
-}
-#else  /* MLKEM_USE_NATIVE_INTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *p)
-{
-  intt_native(p);
-  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
-}
-#endif /* MLKEM_USE_NATIVE_INTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
-                    int16_t b_cached)
-{
-  int32_t t0, t1;
-  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
-
-  t0 = (int32_t)a[1] * b_cached;
-  t0 += (int32_t)a[0] * b[0];
-  t1 = (int32_t)a[0] * b[1];
-  t1 += (int32_t)a[1] * b[0];
-
-  /* |ti| < 2 * q * 2^15 */
-  r[0] = montgomery_reduce(t0);
-  r[1] = montgomery_reduce(t1);
-
-  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
-}
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt)
-int empty_cu_ntt;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.h
deleted file mode 100644
index 4e80d3ab3..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/ntt.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef NTT_H
-#define NTT_H
-#include "common.h"
-
-#include <stdint.h>
-#include "cbmc.h"
-#include "poly.h"
-#include "reduce.h"
-
-#define zetas MLKEM_NAMESPACE(zetas)
-extern const int16_t zetas[128];
-
-#define poly_ntt MLKEM_NAMESPACE(poly_ntt)
-/*************************************************
- * Name:        poly_ntt
- *
- * Description: Computes negacyclic number-theoretic transform (NTT) of
- *              a polynomial in place.
- *
- *              The input is assumed to be in normal order and
- *              coefficient-wise bound by MLKEM_Q in absolute value.
- *
- *              The output polynomial is in bitreversed order, and
- *              coefficient-wise bound by NTT_BOUND in absolute value.
- *
- *              (NOTE: Sometimes the input to the NTT is actually smaller,
- *               which gives better bounds.)
- *
- * Arguments:   - poly *p: pointer to in/output polynomial
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND))
-);
-
-#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont)
-/*************************************************
- * Name:        poly_invntt_tomont
- *
- * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
- *              of a polynomial in place;
- *              inputs assumed to be in bitreversed order, output in normal
- *              order
- *
- *              The input is assumed to be in bitreversed order, and can
- *              have arbitrary coefficients in int16_t.
- *
- *              The output polynomial is in normal order, and
- *              coefficient-wise bound by INVNTT_BOUND in absolute value.
- *
- * Arguments:   - uint16_t *a: pointer to in/output polynomial
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND))
-);
-
-#define basemul_cached MLKEM_NAMESPACE(basemul_cached)
-/************************************************************
- * Name: basemul_cached
- *
- * Description: Computes a representative modulo q of
- *              (a0*b0 + a1*b_cached, a0*b1 + a1*b0)/65536
- *
- *              If b_cached is b1*zeta, this represents the
- *              product of (a0 + a1*X) and (b0 + b1*X) in
- *              Fq[X]/(X^2 - zeta).
- *
- * Arguments: - r: Pointer to output polynomial
- *                   Upon return, coefficients are bound by
- *                   2*MLKEM_Q in absolute value.
- *            - a: Pointer to first input polynomial
- *                   Every coefficient must be in [0..4095]
- *            - b: Pointer to second input polynomial
- *                   Can have arbitrary int16_t coefficients
- *            - b_cached: Some precomputed value, typically derived from
- *                   b1 and a twiddle factor. Can be an arbitary int16_t.
- ************************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
-                    int16_t b_cached)
-__contract__(
-  requires(memory_no_alias(r, 2 * sizeof(int16_t)))
-  requires(memory_no_alias(a, 2 * sizeof(int16_t)))
-  requires(memory_no_alias(b, 2 * sizeof(int16_t)))
-  requires(array_bound(a, 0, 2, 0, UINT12_LIMIT))
-  assigns(memory_slice(r, 2 * sizeof(int16_t)))
-  ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))
-);
-
-#endif /* NTT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/params.h
index 57ea4c8ba..7f6c12625 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/params.h
@@ -18,6 +18,7 @@
 #define MLKEM_N 256
 #define MLKEM_Q 3329
 #define UINT12_LIMIT 4096
+#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */
 
 #define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */
 #define MLKEM_SSBYTES 32  /* size in bytes of shared key */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.c
index 7483ebf6d..e8a2e2c6e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.c
@@ -8,388 +8,246 @@
 #include <stdint.h>
 #include <string.h>
 #include "arith_backend.h"
-#include "cbd.h"
 #include "cbmc.h"
 #include "debug.h"
 #include "fips202x4.h"
-#include "ntt.h"
 #include "poly.h"
-#include "reduce.h"
+#include "sampling.h"
 #include "symmetric.h"
 #include "verify.h"
 
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
-{
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 16)))
-    {
-      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
-    }
-
-    r[i * 4] = t[0] | (t[1] << 4);
-    r[i * 4 + 1] = t[2] | (t[3] << 4);
-    r[i * 4 + 2] = t[4] | (t[5] << 4);
-    r[i * 4 + 3] = t[6] | (t[7] << 4);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
-{
-  unsigned j;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-  for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(invariant(j <= MLKEM_N / 4))
-  {
-    unsigned k;
-    uint16_t t[4];
-    for (k = 0; k < 4; k++)
-    __loop__(
-      invariant(k <= 4)
-      invariant(forall(r, 0, k, t[r] < (1u << 10))))
-    {
-      t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
-    }
-
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 10-bit in size.
-     */
-    r[5 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
-    r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
-    r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
-    r[5 * j + 4] = (t[3] >> 2);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
-  {
-    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
-    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d10(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16)
+#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic)
+#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce)
+#define fqmul MLKEM_NAMESPACE(fqmul)
+#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce)
+#define basemul_cached MLKEM_NAMESPACE(basemul_cached)
+#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q)
+#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block)
+#define ntt_layer MLKEM_NAMESPACE(ntt_layer)
+#define invntt_layer MLKEM_NAMESPACE(invntt_layer)
+/* End of static namespacing */
+
+/*************************************************
+ * Name:        cast_uint16_to_int16
+ *
+ * Description: Cast uint16 value to int16
+ *
+ * Returns:
+ *   input x in     0 .. 32767: returns value unchanged
+ *   input x in 32768 .. 65535: returns (x - 65536)
+ **************************************************/
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "conversion"
+#endif
+ALWAYS_INLINE
+static INLINE int16_t cast_uint16_to_int16(uint16_t x)
 {
-  unsigned j;
-  for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(
-    invariant(j <= MLKEM_N / 4)
-    invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
-  {
-    unsigned k;
-    uint16_t t[4];
-    uint8_t const *base = &a[5 * j];
-
-    t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6));
-    t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4));
-    t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2));
-
-    for (k = 0; k < 4; k++)
-    __loop__(
-      invariant(k <= 4)
-      invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
-    }
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  /*
+   * PORTABILITY: This relies on uint16_t -> int16_t
+   * being implemented as the inverse of int16_t -> uint16_t,
+   * which is implementation-defined (C99 6.3.1.3 (3))
+   * CBMC (correctly) fails to prove this conversion is OK,
+   * so we have to suppress that check here
+   */
+  return (int16_t)x;
 }
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
-          || MLKEM_K == 3) */
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
 
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
+/*************************************************
+ * Name:        montgomery_reduce_generic
+ *
+ * Description: Generic Montgomery reduction; given a 32-bit integer a, computes
+ *              16-bit integer congruent to a * R^-1 mod q, where R=2^16
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q, with absolute value
+ *                <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2
+ *
+ **************************************************/
+ALWAYS_INLINE
+static INLINE int16_t montgomery_reduce_generic(int32_t a)
 {
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */
+  const uint32_t QINV = 62209; /* q^-1 mod 2^16 */
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 32)))
-    {
-      t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
-    }
+  /*  Compute a*q^{-1} mod 2^16 in unsigned representatives */
+  const uint16_t a_reduced = a & UINT16_MAX;
+  const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
 
-    /*
-     * Explicitly truncate to avoid warning about
-     * implicit truncation in CBMC, and use array indexing into
-     * r rather than pointer-arithmetic to simplify verification
-     */
-    r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
-    r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
-    r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
-    r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
-    r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
-  }
-}
+  /* Lift to signed canonical representative mod 2^16. */
+  const int16_t t = cast_uint16_to_int16(a_inverted);
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
-{
-  unsigned j;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  int32_t r = a - ((int32_t)t * MLKEM_Q);
+  /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */
 
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(invariant(j <= MLKEM_N / 8))
-  {
-    unsigned k;
-    uint16_t t[8];
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(k <= 8)
-      invariant(forall(r, 0, k, t[r] < (1u << 11))))
-    {
-      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
-    }
+  /*
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  r = r >> 16;
+  /* Bounds: |r >> 16| <= ceil(|r| / 2^16)
+   *                   <= ceil(|a| / 2^16 + MLKEM_Q / 2)
+   *                   <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
+   *
+   * (Note that |a >> 16| = ceil(|a| / 2^16) for negative a)
+   */
 
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 11-bit in size.
-     */
-    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
-    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
-    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
-    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
-    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
-    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
-    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
-    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
-    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
-    r[11 * j + 10] = (t[7] >> 3);
-  }
+  return (int16_t)r;
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+/*************************************************
+ * Name:        montgomery_reduce
+ *
+ * Description: Montgomery reduction
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *                  Must be smaller than 2 * 2^12 * 2^15 in absolute value.
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q,
+ *              smaller than 2 * q in absolute value.
+ **************************************************/
+static INLINE int16_t montgomery_reduce(int32_t a)
+__contract__(
+  requires(a > -(2 * UINT12_LIMIT * 32768))
+  requires(a <  (2 * UINT12_LIMIT * 32768))
+  ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
+)
 {
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
-  {
-    unsigned j;
-    uint8_t t[8];
-    const unsigned offset = i * 5;
-    /*
-     * Explicitly truncate to avoid warning about
-     * implicit truncation in CBMC and unwind loop for ease
-     * of proof.
-     */
-
-    /*
-     * Decompress 5 8-bit bytes (so 40 bits) into
-     * 8 5-bit values stored in t[]
-     */
-    t[0] = 0x1F & (a[offset + 0] >> 0);
-    t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3));
-    t[2] = 0x1F & (a[offset + 1] >> 2);
-    t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1));
-    t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4));
-    t[5] = 0x1F & (a[offset + 3] >> 1);
-    t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2));
-    t[7] = 0x1F & (a[offset + 4] >> 3);
-
-    /* and copy to the correct slice in r[] */
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(j <= 8 && i <= MLKEM_N / 8)
-      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
-    }
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  int16_t res;
+  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
+
+  res = montgomery_reduce_generic(a);
+  /* Bounds:
+   * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2
+   *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
+   *        < 2 * MLKEM_Q */
+
+  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
+  return res;
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d11(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+#if !defined(MLKEM_USE_NATIVE_POLY_TOMONT) ||           \
+    !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
+    !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        fqmul
+ *
+ * Description: Montgomery multiplication modulo q=3329
+ *
+ * Arguments:   - int16_t a: first factor
+ *                  Can be any int16_t.
+ *              - int16_t b: second factor.
+ *                  Must be signed canonical (abs value <(q+1)/2)
+ *
+ * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and
+ * smaller than q in absolute value.
+ *
+ **************************************************/
+static INLINE int16_t fqmul(int16_t a, int16_t b)
+__contract__(
+  requires(b > -HALF_Q)
+  requires(b < HALF_Q)
+  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
+)
 {
-  unsigned j;
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(
-    invariant(j <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
-  {
-    unsigned k;
-    uint16_t t[8];
-    uint8_t const *base = &a[11 * j];
-    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
-    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
-                    ((uint16_t)base[4] << 10));
-    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
-    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
-    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
-                    ((uint16_t)base[8] << 9));
-    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
-    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
-
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(k <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
-    }
-  }
+  int16_t res;
+  debug_assert_abs_bound(&b, 1, HALF_Q);
+
+  res = montgomery_reduce((int32_t)a * (int32_t)b);
+  /* Bounds:
+   * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
+   *        < MLKEM_Q
+   */
 
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  debug_assert_abs_bound(&res, 1, MLKEM_Q);
+  return res;
 }
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */
-
-#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+#endif /* !defined(MLKEM_USE_NATIVE_POLY_TOMONT) ||           \
+          !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
+          !defined(MLKEM_USE_NATIVE_NTT) ||                   \
+          !defined(MLKEM_USE_NATIVE_INTT) */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || !defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        barrett_reduce
+ *
+ * Description: Barrett reduction; given a 16-bit integer a, computes
+ *              centered representative congruent to a mod q in
+ *              {-(q-1)/2,...,(q-1)/2}
+ *
+ * Arguments:   - int16_t a: input integer to be reduced
+ *
+ * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
+ **************************************************/
+static INLINE int16_t barrett_reduce(int16_t a)
+__contract__(
+  ensures(return_value > -HALF_Q && return_value < HALF_Q)
+)
 {
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(invariant(i <= MLKEM_N / 2))
-  {
-    const uint16_t t0 = a->coeffs[2 * i];
-    const uint16_t t1 = a->coeffs[2 * i + 1];
-    /*
-     * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
-     * significant data, so these can be packed into 24 bits or exactly
-     * 3 bytes, as follows.
-     */
-
-    /* Least significant bits 0 - 7 of t0. */
-    r[3 * i + 0] = t0 & 0xFF;
-
-    /*
-     * Most significant bits 8 - 11 of t0 become the least significant
-     * nibble of the second byte. The least significant 4 bits
-     * of t1 become the upper nibble of the second byte.
-     */
-    r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+  /*
+   * To divide by MLKEM_Q using Barrett multiplication, the "magic number"
+   * multiplier is round_to_nearest(2**26/MLKEM_Q)
+   */
+  const int BPOWER = 26;
+  const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q;
 
-    /* Bits 4 - 11 of t1 become the third byte. */
-    r[3 * i + 2] = t1 >> 4;
-  }
-}
-#else  /* MLKEM_USE_NATIVE_POLY_TOBYTES */
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
-{
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-  poly_tobytes_native(r, a);
-}
-#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+  /*
+   * Compute round_to_nearest(a/MLKEM_Q) using the multiplier
+   * above and shift by BPOWER places.
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER;
 
-#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
-  {
-    const uint8_t t0 = a[3 * i + 0];
-    const uint8_t t1 = a[3 * i + 1];
-    const uint8_t t2 = a[3 * i + 2];
-    r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
-    r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
-  }
+  /*
+   * t is in -10 .. +10, so we need 32-bit math to
+   * evaluate t * MLKEM_Q and the subsequent subtraction
+   */
+  int16_t res = (int16_t)(a - t * MLKEM_Q);
 
-  /* Note that the coefficients are not canonical */
-  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
-}
-#else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-{
-  poly_frombytes_native(r, a);
+  debug_assert_abs_bound(&res, 1, HALF_Q);
+  return res;
 }
-#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+#endif /* !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || \
+          !defined(MLKEM_USE_NATIVE_INTT) */
+
+static void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
+                           int16_t b_cached)
+__contract__(
+  requires(memory_no_alias(r, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(a, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(b, 2 * sizeof(int16_t)))
+  requires(array_bound(a, 0, 2, 0, UINT12_LIMIT))
+  assigns(memory_slice(r, 2 * sizeof(int16_t)))
+  ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q)))
 {
-  unsigned i;
-#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
-#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
-#endif
+  int32_t t0, t1;
+  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
-  {
-    unsigned j;
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <  MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
-    {
-      /* Prevent the compiler from recognizing this as a bit selection */
-      uint8_t mask = value_barrier_u8(1u << j);
-      r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
-    }
-  }
-  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
-}
+  t0 = (int32_t)a[1] * b_cached;
+  t0 += (int32_t)a[0] * b[0];
+  t1 = (int32_t)a[0] * b[1];
+  t1 += (int32_t)a[1] * b[0];
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
-{
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  /* |ti| < 2 * UINT12_LIMIT * 2^15, as required by montgomery_reduce() */
+  r[0] = montgomery_reduce(t0);
+  r[1] = montgomery_reduce(t1);
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    msg[i] = 0;
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8))
-    {
-      uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
-      msg[i] |= t << j;
-    }
-  }
+  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -434,12 +292,46 @@ void poly_tomont(poly *r)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomont(poly *r)
 {
-  poly_tomont_native(r);
+  poly_tomont_native(r->coeffs);
   debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
 
 #if !defined(MLKEM_USE_NATIVE_POLY_REDUCE)
+/************************************************************
+ * Name: scalar_signed_to_unsigned_q
+ *
+ * Description: converts signed polynomial coefficient
+ *              from signed (-3328 .. 3328) form to
+ *              unsigned form (0 .. 3328).
+ *
+ * Note: Cryptographic constant time implementation
+ *
+ * Examples:       0 -> 0
+ *                 1 -> 1
+ *              3328 -> 3328
+ *                -1 -> 3328
+ *                -2 -> 3327
+ *             -3328 -> 1
+ *
+ * Arguments: c: signed coefficient to be converted
+ ************************************************************/
+static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
+__contract__(
+  requires(c > -MLKEM_Q && c < MLKEM_Q)
+  ensures(return_value >= 0 && return_value < MLKEM_Q)
+  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
+{
+  debug_assert_abs_bound(&c, 1, MLKEM_Q);
+
+  /* Add Q if c is negative, but in constant time */
+  c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
+
+  /* and therefore cast to uint16_t is safe. */
+  debug_assert_bound(&c, 1, 0, MLKEM_Q);
+  return (uint16_t)c;
+}
+
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
@@ -461,7 +353,7 @@ void poly_reduce(poly *r)
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
-  poly_reduce_native(r);
+  poly_reduce_native(r->coeffs);
   debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
@@ -520,13 +412,232 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 MLKEM_NATIVE_INTERNAL_API
 void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
-  poly_mulcache_compute_native(x, a);
+  poly_mulcache_compute_native(x->coeffs, a->coeffs);
   /* Omitting bounds assertion since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
 }
 #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 
+#if !defined(MLKEM_USE_NATIVE_NTT)
+/*
+ * Computes a block of CT butterflies with a fixed twiddle factor,
+ * using Montgomery multiplication.
+ * Parameters:
+ * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
+ * - zeta: Twiddle factor to use for the butterfly. This must be in
+ *         Montgomery form and signed canonical.
+ * - start: Offset to the beginning of the butterfly block
+ * - len: Index difference between coefficients subject to a butterfly
+ * - bound: Ghost variable describing coefficient bound: Prior to `start`,
+ *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
+ *          they must be bound by `bound`.
+ * When this function returns, output coefficients in the index range
+ * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
+ * Example:
+ * - start=8, len=4
+ *   This would compute the following four butterflies
+ *          8     --    12
+ *             9    --     13
+ *                10   --     14
+ *                   11   --     15
+ * - start=4, len=2
+ *   This would compute the following two butterflies
+ *          4 -- 6
+ *             5 -- 7
+ */
+static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
+                                unsigned start, unsigned len, int bound)
+__contract__(
+  requires(start < MLKEM_N)
+  requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
+  requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
+  requires(-HALF_Q < zeta && zeta < HALF_Q)
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(array_abs_bound(r, 0, start, bound + MLKEM_Q))
+  requires(array_abs_bound(r, start, MLKEM_N, bound))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q))
+  ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
+{
+  /* `bound` is a ghost variable only needed in the CBMC specification */
+  unsigned j;
+  ((void)bound);
+  for (j = start; j < start + len; j++)
+  __loop__(
+    invariant(start <= j && j <= start + len)
+    /*
+     * Coefficients are updated in strided pairs, so the bounds for the
+     * intermediate states alternate twice between the old and new bound
+     */
+    invariant(array_abs_bound(r, 0,           j,           bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j,           start + len, bound))
+    invariant(array_abs_bound(r, start + len, j + len,     bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j + len,     MLKEM_N,     bound)))
+  {
+    int16_t t;
+    t = fqmul(r[j + len], zeta);
+    r[j + len] = r[j] - t;
+    r[j] = r[j] + t;
+  }
+}
+
+/*
+ * Compute one layer of forward NTT
+ * Parameters:
+ * - r: Pointer to base of polynomial
+ * - len: Stride of butterflies in this layer.
+ * - layer: Ghost variable indicating which layer is being applied.
+ *          Must match `len` via `len == MLKEM_N >> layer`.
+ * Note: `len` could be dropped and computed in the function, but
+ *   we are following the structure of the reference NTT from the
+ *   official Kyber implementation here, merely adding `layer` as
+ *   a ghost variable for the specifications.
+ */
+static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
+  requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
+{
+  unsigned start, k;
+  /* `layer` is a ghost variable only needed in the CBMC specification */
+  ((void)layer);
+  /* Twiddle factors for layer n start at index 2^(n-1) */
+  k = MLKEM_N / (2 * len);
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(start < MLKEM_N + 2 * len)
+    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
+    invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
+    invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
+  {
+    int16_t zeta = zetas[k++];
+    ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
+  }
+}
+
+/*
+ * Compute full forward NTT
+ * NOTE: This particular implementation satisfies a much tighter
+ * bound on the output coefficients (5*q) than the contractual one (8*q),
+ * but this is not needed in the calling code. Should we change the
+ * base multiplication strategy to require smaller NTT output bounds,
+ * the proof may need strengthening.
+ */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *p)
+{
+  unsigned len, layer;
+  int16_t *r;
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+  r = p->coeffs;
+
+  for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
+  __loop__(
+    invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer))
+    invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)))
+  {
+    ntt_layer(r, len, layer);
+  }
+
+  /* Check the stronger bound */
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
+}
+#else  /* MLKEM_USE_NATIVE_NTT */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *p)
+{
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+  ntt_native(p->coeffs);
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
+}
+#endif /* MLKEM_USE_NATIVE_NTT */
+
+#if !defined(MLKEM_USE_NATIVE_INTT)
+
+/* Compute one layer of inverse NTT */
+static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
+  requires(len == (1 << (8 - layer)))
+  requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+{
+  unsigned start, k;
+  /* `layer` is a ghost variable used only in the specification */
+  ((void)layer);
+  k = MLKEM_N / len - 1;
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+    invariant(start <= MLKEM_N && k <= 127)
+    /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
+    invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
+  {
+    unsigned j;
+    int16_t zeta = zetas[k--];
+    for (j = start; j < start + len; j++)
+    __loop__(
+      invariant(start <= j && j <= start + len)
+      invariant(start <= MLKEM_N && k <= 127)
+      invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+    {
+      int16_t t = r[j];
+      r[j] = barrett_reduce(t + r[j + len]);
+      r[j + len] = r[j + len] - t;
+      r[j + len] = fqmul(r[j + len], zeta);
+    }
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *p)
+{
+  /*
+   * Scale input polynomial to account for Montgomery factor
+   * and NTT twist. This also brings coefficients down to
+   * absolute value < MLKEM_Q.
+   */
+  unsigned j, len, layer;
+  const int16_t f = 1441;
+  int16_t *r = p->coeffs;
+
+  for (j = 0; j < MLKEM_N; j++)
+  __loop__(
+    invariant(j <= MLKEM_N)
+    invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
+  {
+    r[j] = fqmul(r[j], f);
+  }
+
+  /* Run the invNTT layers */
+  for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
+  __loop__(
+    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
+    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+  {
+    invntt_layer(p->coeffs, len, layer);
+  }
+
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
+}
+#else  /* MLKEM_USE_NATIVE_INTT */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *p)
+{
+  intt_native(p->coeffs);
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
+}
+#endif /* MLKEM_USE_NATIVE_INTT */
+
 #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 #define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.h
index 6a14c785d..cb0d67c1a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly.h
@@ -9,7 +9,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "reduce.h"
+#include "debug.h"
 #include "verify.h"
 
 /* Absolute exclusive upper bound for the output of the inverse NTT */
@@ -18,6 +18,9 @@
 /* Absolute exclusive upper bound for the output of the forward NTT */
 #define NTT_BOUND (8 * MLKEM_Q)
 
+#define zetas MLKEM_NAMESPACE(zetas)
+extern const int16_t zetas[128];
+
 /*
  * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
  * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
@@ -38,520 +41,6 @@ typedef struct
   int16_t coeffs[MLKEM_N >> 1];
 } poly_mulcache;
 
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1)
-#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4)
-#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5)
-#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10)
-#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11)
-#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4)
-#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5)
-#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10)
-#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11)
-#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q)
-/* End of static namespacing */
-
-/************************************************************
- * Name: scalar_compress_d1
- *
- * Description: Computes round(u * 2 / q)
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 1.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d1(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 2)
-  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
-{
-  uint32_t d0 = u << 1;
-  d0 *= 645083;
-  d0 += 1u << 30;
-  d0 >>= 31;
-  return d0;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_compress_d4
- *
- * Description: Computes round(u * 16 / q) % 16
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 4.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d4(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 16)
-  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
-{
-  uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */
-  return (d0 + (1u << 27)) >> 28;      /* round(d0/2^28) */
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d4
- *
- * Description: Computes round(u * q / 16)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 4.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d4(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 16)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 8) / 16; }
-
-/************************************************************
- * Name: scalar_compress_d5
- *
- * Description: Computes round(u * 32 / q) % 32
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 5.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d5(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 32)
-  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
-{
-  uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */
-  return (d0 + (1u << 26)) >> 27;      /* round(d0/2^27) */
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d5
- *
- * Description: Computes round(u * q / 32)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 5.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 32
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d5(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 32)
-  ensures(return_value <= MLKEM_Q - 1)
-) { return ((u * MLKEM_Q) + 16) / 32; }
-
-/************************************************************
- * Name: scalar_compress_d10
- *
- * Description: Computes round(u * 2**10 / q) % 2**10
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d10(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < (1u << 10))
-  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
-{
-  uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^32 / MLKEM_Q) */
-  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
-  return (d0 & 0x3FF);
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d10
- *
- * Description: Computes round(u * q / 1024)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d10(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 1024)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 512) / 1024; }
-
-/************************************************************
- * Name: scalar_compress_d11
- *
- * Description: Computes round(u * 2**11 / q) % 2**11
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 11.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d11(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < (1u << 11))
-  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
-{
-  uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */
-  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
-  return (d0 & 0x7FF);
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d11
- *
- * Description: Computes round(u * q / 1024)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d11(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 2048)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 1024) / 2048; }
-
-/************************************************************
- * Name: scalar_signed_to_unsigned_q
- *
- * Description: converts signed polynomial coefficient
- *              from signed (-3328 .. 3328) form to
- *              unsigned form (0 .. 3328).
- *
- * Note: Cryptographic constant time implementation
- *
- * Examples:       0 -> 0
- *                 1 -> 1
- *              3328 -> 3328
- *                -1 -> 3328
- *                -2 -> 3327
- *             -3328 -> 1
- *
- * Arguments: c: signed coefficient to be converted
- ************************************************************/
-static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
-__contract__(
-  requires(c > -MLKEM_Q && c < MLKEM_Q)
-  ensures(return_value >= 0 && return_value < MLKEM_Q)
-  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
-{
-  debug_assert_abs_bound(&c, 1, MLKEM_Q);
-
-  /* Add Q if c is negative, but in constant time */
-  c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
-
-  /* and therefore cast to uint16_t is safe. */
-  debug_assert_bound(&c, 1, 0, MLKEM_Q);
-  return (uint16_t)c;
-}
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
-    (MLKEM_K == 2 || MLKEM_K == 3)
-#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
-/*************************************************
- * Name:        poly_compress_d4
- *
- * Description: Compression (4 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
-
-#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
-/*************************************************
- * Name:        poly_compress_d10
- *
- * Description: Compression (10 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
-
-#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
-/*************************************************
- * Name:        poly_decompress_d4
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
-
-#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
-/*************************************************
- * Name:        poly_decompress_d10
- *
- * Description: De-serialization and subsequent decompression (10 bits) of a
- *              polynomial; approximate inverse of poly_compress_d10
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d10(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
-          || MLKEM_K == 3) */
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
-#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
-/*************************************************
- * Name:        poly_compress_d5
- *
- * Description: Compression (5 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
-
-#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
-/*************************************************
- * Name:        poly_compress_d11
- *
- * Description: Compression (11 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
-
-#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
-/*************************************************
- * Name:        poly_decompress_d5
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
-
-#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
-/*************************************************
- * Name:        poly_decompress_d11
- *
- * Description: De-serialization and subsequent decompression (11 bits) of a
- *              polynomial; approximate inverse of poly_compress_d11
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d11(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
-        */
-
-#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
-/*************************************************
- * Name:        poly_tobytes
- *
- * Description: Serialization of a polynomial.
- *              Signed coefficients are converted to
- *              unsigned form before serialization.
- *
- * Arguments:   INPUT:
- *              - a: const pointer to input polynomial,
- *                with each coefficient in the range [0,1,..,Q-1]
- *              OUTPUT
- *              - r: pointer to output byte array
- *                   (of MLKEM_POLYBYTES bytes)
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYBYTES))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r))
-);
-
-
-#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes)
-/*************************************************
- * Name:        poly_frombytes
- *
- * Description: De-serialization of a polynomial.
- *
- * Arguments:   INPUT
- *              - a: pointer to input byte array
- *                   (of MLKEM_POLYBYTES bytes)
- *              OUTPUT
- *              - r: pointer to output polynomial, with
- *                   each coefficient unsigned and in the range
- *                   0 .. 4095
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))
-);
-
-
-#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg)
-/*************************************************
- * Name:        poly_frommsg
- *
- * Description: Convert 32-byte message to polynomial
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *msg: pointer to input message
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
-__contract__(
-  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
-
-#define poly_tomsg MLKEM_NAMESPACE(poly_tomsg)
-/*************************************************
- * Name:        poly_tomsg
- *
- * Description: Convert polynomial to 32-byte message
- *
- * Arguments:   - uint8_t *msg: pointer to output message
- *              - const poly *r: pointer to input polynomial
- *                Coefficients must be unsigned canonical
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r)
-__contract__(
-  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(msg))
-);
-
 #define poly_basemul_montgomery_cached \
   MLKEM_NAMESPACE(poly_basemul_montgomery_cached)
 /*************************************************
@@ -715,4 +204,56 @@ __contract__(
   assigns(object_whole(r))
 );
 
+#define poly_ntt MLKEM_NAMESPACE(poly_ntt)
+/*************************************************
+ * Name:        poly_ntt
+ *
+ * Description: Computes negacyclic number-theoretic transform (NTT) of
+ *              a polynomial in place.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bound by MLKEM_Q in absolute value.
+ *
+ *              The output polynomial is in bitreversed order, and
+ *              coefficient-wise bound by NTT_BOUND in absolute value.
+ *
+ *              (NOTE: Sometimes the input to the NTT is actually smaller,
+ *               which gives better bounds.)
+ *
+ * Arguments:   - poly *r: pointer to in/output polynomial
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND))
+);
+
+#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont)
+/*************************************************
+ * Name:        poly_invntt_tomont
+ *
+ * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+ *              of a polynomial in place;
+ *              inputs assumed to be in bitreversed order, output in normal
+ *              order
+ *
+ *              The input is assumed to be in bitreversed order, and can
+ *              have arbitrary coefficients in int16_t.
+ *
+ *              The output polynomial is in normal order, and
+ *              coefficient-wise bound by INVNTT_BOUND in absolute value.
+ *
+ * Arguments:   - poly *r: pointer to in/output polynomial
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND))
+);
+
 #endif /* POLY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly_k.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly_k.c
new file mode 100644
index 000000000..c2d330ea9
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly_k.c
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "poly_k.h"
+#include <stdint.h>
+#include <string.h>
+#include "arith_backend.h"
+#include "compress.h"
+#include "sampling.h"
+#include "symmetric.h"
+
+#include "debug.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1)
+#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2)
+/* End of static namespacing */
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
+                         const polyvec *a)
+{
+  unsigned i;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_decompress_du(polyvec *r,
+                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
+  }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
+{
+  unsigned i;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
+  }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_ntt(polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_ntt(&r->vec[i]);
+  }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_invntt_tomont(polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_invntt_tomont(&r->vec[i]);
+  }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND);
+}
+
+#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+{
+  unsigned i;
+  poly t;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+
+  poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]);
+  for (i = 1; i < MLKEM_K; i++)
+  {
+    poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i],
+                                   &b_cache->vec[i]);
+    poly_add(r, &t);
+  }
+
+  /*
+   * Check all MLKEM_N coefficients of the accumulated sum of MLKEM_K
+   * products. This bound is true for the C implementation, but not needed
+   * in the higher level bounds reasoning; it is thus omitted from the spec
+   * to not unnecessarily constrain native implementations.
+   */
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_K * 2 * MLKEM_Q);
+}
+#else  /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+{
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+  /* Omitting bounds assertion for cache since native implementations may
+   * decide not to use a mulcache. Note that the C backend implementation
+   * of poly_basemul_montgomery_cached() does still include the check. */
+  polyvec_basemul_acc_montgomery_cached_native(r->coeffs, (const int16_t *)a,
+                                               (const int16_t *)b,
+                                               (const int16_t *)b_cache);
+}
+#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
+{
+  polyvec_mulcache b_cache;
+  polyvec_mulcache_compute(&b_cache, b);
+  polyvec_basemul_acc_montgomery_cached(r, a, b, &b_cache);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_mulcache_compute(&x->vec[i], &a->vec[i]);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_reduce(polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_reduce(&r->vec[i]);
+  }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_add(polyvec *r, const polyvec *b)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_add(&r->vec[i], &b->vec[i]);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_tomont(polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_tomont(&r->vec[i]);
+  }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
+}
+
+
+/*************************************************
+ * Name:        poly_cbd_eta1
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta1(poly *r,
+                                 const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
+)
+{
+#if MLKEM_ETA1 == 2
+  poly_cbd2(r, buf);
+#elif MLKEM_ETA1 == 3
+  poly_cbd3(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA1"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+{
+  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
+  memcpy(extkey0, seed, MLKEM_SYMBYTES);
+  memcpy(extkey1, seed, MLKEM_SYMBYTES);
+  memcpy(extkey2, seed, MLKEM_SYMBYTES);
+  memcpy(extkey3, seed, MLKEM_SYMBYTES);
+  extkey0[MLKEM_SYMBYTES] = nonce0;
+  extkey1[MLKEM_SYMBYTES] = nonce1;
+  extkey2[MLKEM_SYMBYTES] = nonce2;
+  extkey3[MLKEM_SYMBYTES] = nonce3;
+  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
+  poly_cbd_eta1(r0, buf0);
+  poly_cbd_eta1(r1, buf1);
+  poly_cbd_eta1(r2, buf2);
+  poly_cbd_eta1(r3, buf3);
+
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
+}
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+/*************************************************
+ * Name:        poly_cbd_eta2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta2(poly *r,
+                                 const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1)))
+{
+#if MLKEM_ETA2 == 2
+  poly_cbd2(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA2"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+{
+  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
+
+  memcpy(extkey, seed, MLKEM_SYMBYTES);
+  extkey[MLKEM_SYMBYTES] = nonce; /* PRF input: seed bytes, then nonce */
+  prf_eta2(buf, extkey);
+
+  poly_cbd_eta2(r, buf);
+  /* Matches the MLKEM_ETA2 + 1 bound ensured by poly_cbd_eta2() */
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1);
+}
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+
+#if MLKEM_K == 2
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+{
+  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
+  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+  extkey[0][MLKEM_SYMBYTES] = nonce0;
+  extkey[1][MLKEM_SYMBYTES] = nonce1;
+  extkey[2][MLKEM_SYMBYTES] = nonce2;
+  extkey[3][MLKEM_SYMBYTES] = nonce3;
+
+  prf_eta1(buf1[0], extkey[0]);
+  prf_eta1(buf1[1], extkey[1]);
+  prf_eta2(buf2[0], extkey[2]);
+  prf_eta2(buf2[1], extkey[3]);
+
+  poly_cbd_eta1(r0, buf1[0]);
+  poly_cbd_eta1(r1, buf1[1]);
+  poly_cbd_eta2(r2, buf2[0]);
+  poly_cbd_eta2(r3, buf2[1]);
+
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1);
+}
+#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly_k.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly_k.h
new file mode 100644
index 000000000..0aea95912
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/poly_k.h
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef POLY_K_H
+#define POLY_K_H
+
+#include <stdint.h>
+#include "common.h"
+#include "compress.h"
+#include "poly.h"
+
+#define polyvec MLKEM_NAMESPACE_K(polyvec)
+typedef struct
+{
+  poly vec[MLKEM_K];
+} ALIGN polyvec;
+
+#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache)
+typedef struct
+{
+  poly_mulcache vec[MLKEM_K];
+} polyvec_mulcache;
+
+#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du)
+/*************************************************
+ * Name:        poly_compress_du
+ *
+ * Description: Compression (du bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU)))
+{
+#if MLKEM_DU == 10
+  poly_compress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_compress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du)
+/*************************************************
+ * Name:        poly_decompress_du
+ *
+ * Description: De-serialization and subsequent decompression (du bits) of a
+ *              polynomial; approximate inverse of poly_compress_du
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_du(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DU == 10
+  poly_decompress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_decompress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv)
+/*************************************************
+ * Name:        poly_compress_dv
+ *
+ * Description: Compression (dv bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r)))
+{
+#if MLKEM_DV == 4
+  poly_compress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_compress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+
+#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv)
+/*************************************************
+ * Name:        poly_decompress_dv
+ *
+ * Description: De-serialization and subsequent decompression (dv bits) of a
+ *              polynomial; approximate inverse of poly_compress_dv
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_dv(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DV == 4
+  poly_decompress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_decompress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du)
+/*************************************************
+ * Name:        polyvec_compress_du
+ *
+ * Description: Compress and serialize vector of polynomials
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU)
+ *              - const polyvec *a: pointer to input vector of polynomials.
+ *                                  Coefficients must be unsigned canonical,
+ *                                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
+                         const polyvec *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(forall(k0, 0, MLKEM_K,
+         array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+  assigns(object_whole(r))
+);
+
+#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du)
+/*************************************************
+ * Name:        polyvec_decompress_du
+ *
+ * Description: De-serialize and decompress vector of polynomials;
+ *              approximate inverse of polyvec_compress_du
+ *
+ * Arguments:   - polyvec *r:       pointer to output vector of polynomials.
+ *                Output will have coefficients normalized to [0,..,q-1].
+ *              - const uint8_t *a: pointer to input byte array
+ *                                  (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU)
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_decompress_du(polyvec *r,
+                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(k0, 0, MLKEM_K,
+         array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+);
+
+#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes)
+/*************************************************
+ * Name:        polyvec_tobytes
+ *
+ * Description: Serialize vector of polynomials
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (needs space for MLKEM_POLYVECBYTES)
+ *              - const polyvec *a: pointer to input vector of polynomials
+ *                  Each polynomial must have coefficients in [0,..,q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
+__contract__(
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(memory_no_alias(r, MLKEM_POLYVECBYTES))
+  requires(forall(k0, 0, MLKEM_K,
+         array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+  assigns(object_whole(r))
+);
+
+#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes)
+/*************************************************
+ * Name:        polyvec_frombytes
+ *
+ * Description: De-serialize vector of polynomials;
+ *              inverse of polyvec_tobytes
+ *
+ * Arguments:   - polyvec *r: pointer to output vector of polynomials.
+ *                 Output will have coefficients normalized in [0..4095].
+ *              - const uint8_t *a: pointer to input byte array
+ *                 (of length MLKEM_POLYVECBYTES)
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(memory_no_alias(a, MLKEM_POLYVECBYTES))
+  assigns(object_whole(r))
+  ensures(forall(k0, 0, MLKEM_K,
+        array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
+);
+
+#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt)
+/*************************************************
+ * Name:        polyvec_ntt
+ *
+ * Description: Apply forward NTT to all elements of a vector of polynomials.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bound by MLKEM_Q in absolute value.
+ *
+ *              The output polynomial is in bitreversed order, and
+ *              coefficient-wise bound by NTT_BOUND in absolute value.
+ *
+ * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_ntt(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(forall(j, 0, MLKEM_K,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+  assigns(object_whole(r))
+  ensures(forall(j, 0, MLKEM_K,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND)))
+);
+
+#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont)
+/*************************************************
+ * Name:        polyvec_invntt_tomont
+ *
+ * Description: Apply inverse NTT to all elements of a vector of polynomials
+ *              and multiply by Montgomery factor 2^16
+ *
+ *              The input is assumed to be in bitreversed order, and can
+ *              have arbitrary coefficients in int16_t.
+ *
+ *              The output polynomial is in normal order, and
+ *              coefficient-wise bound by INVNTT_BOUND in absolute value.
+ *
+ *
+ * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_invntt_tomont(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(j, 0, MLKEM_K,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, INVNTT_BOUND)))
+);
+
+#define polyvec_basemul_acc_montgomery \
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery)
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery
+ *
+ * Description: Multiply elements of a and b in NTT domain, accumulate into r,
+ *              and multiply by 2^-16.
+ *
+ * Arguments: - poly *r: pointer to output polynomial
+ *            - const polyvec *a: pointer to first input vector of polynomials
+ *            - const polyvec *b: pointer to second input vector of polynomials
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(memory_no_alias(b, sizeof(polyvec)))
+  requires(forall(k1, 0, MLKEM_K,
+    array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
+  assigns(memory_slice(r, sizeof(poly)))
+);
+
+
+#define polyvec_basemul_acc_montgomery_cached \
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached)
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery_cached
+ *
+ * Description: Scalar product of two vectors of polynomials in NTT domain,
+ *              using mulcache for second operand.
+ *
+ *              Bounds:
+ *              - Every coefficient of a is assumed to be in [0..4095]
+ *              - No bounds guarantees for the coefficients in the result.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const polyvec *a: pointer to first input polynomial vector
+ *              - const polyvec *b: pointer to second input polynomial vector
+ *              - const polyvec_mulcache *b_cache: pointer to mulcache
+ *                  for second input polynomial vector. Can be computed
+ *                  via polyvec_mulcache_compute().
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(memory_no_alias(b, sizeof(polyvec)))
+  requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache)))
+  requires(forall(k1, 0, MLKEM_K,
+     array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
+  assigns(memory_slice(r, sizeof(poly)))
+);
+
+#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute)
+/************************************************************
+ * Name: polyvec_mulcache_compute
+ *
+ * Description: Computes the mulcache for a vector of polynomials in NTT domain
+ *
+ *              The mulcache of a degree-2 polynomial b := b0 + b1*X
+ *              in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when
+ *              computing products of b in Fq[X]/(X^2-zeta).
+ *
+ *              The mulcache of a polynomial in NTT domain -- which is
+ *              a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta),
+ *              for varying zeta, is the 128-tuple of mulcaches of those
+ *              polynomials.
+ *
+ *              The mulcache of a vector of polynomials is the vector
+ *              of mulcaches of its entries.
+ *
+ * Arguments: - x: Pointer to mulcache to be populated
+ *            - a: Pointer to input polynomial vector
+ ************************************************************/
+/*
+ * NOTE: The default C implementation of this function populates
+ * the mulcache with values in (-q,q), but this is not needed for the
+ * higher level safety proofs, and thus not part of the spec.
+ */
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
+__contract__(
+  requires(memory_no_alias(x, sizeof(polyvec_mulcache)))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  assigns(object_whole(x))
+);
+
+#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce)
+/*************************************************
+ * Name:        polyvec_reduce
+ *
+ * Description: Applies Barrett reduction to each coefficient
+ *              of each element of a vector of polynomials;
+ *              for details of the Barrett reduction see comments in reduce.c
+ *
+ * Arguments:   - polyvec *r: pointer to input/output vector of polynomials
+ **************************************************/
+/*
+ * NOTE: The semantics of polyvec_reduce() is different in
+ *       the reference implementation, which requires
+ *       signed canonical output data. Unsigned canonical
+ *       outputs are better suited to the only remaining
+ *       use of poly_reduce() in the context of (de)serialization.
+ */
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_reduce(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(k0, 0, MLKEM_K,
+    array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+);
+
+#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add)
+/*************************************************
+ * Name:        polyvec_add
+ *
+ * Description: Add vectors of polynomials
+ *
+ * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be
+ *              added to
+ *            - const polyvec *b: pointer to second input vector of polynomials
+ *
+ * The coefficients of r and b must be so that the addition does
+ * not overflow. Otherwise, the behaviour of this function is undefined.
+ *
+ * The coefficients returned in *r are in int16_t which is sufficient
+ * to prove type-safety of calling units. Therefore, no stronger
+ * ensures clause is required on this function.
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_add(polyvec *r, const polyvec *b)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(memory_no_alias(b, sizeof(polyvec)))
+  requires(forall(j0, 0, MLKEM_K,
+          forall(k0, 0, MLKEM_N,
+            (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX)))
+  requires(forall(j1, 0, MLKEM_K,
+          forall(k1, 0, MLKEM_N,
+            (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN)))
+  assigns(object_whole(r))
+);
+
+#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont)
+/*************************************************
+ * Name:        polyvec_tomont
+ *
+ * Description: Inplace conversion of all coefficients of a polynomial
+ *              vector from normal domain to Montgomery domain
+ *
+ *              Bounds: Output < q in absolute value.
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_tomont(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(memory_slice(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(j, 0, MLKEM_K,
+    array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+);
+
+#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial distribution
+ * with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+/* Depending on MLKEM_K, the pointers passed to this function belong
+   to the same objects, so we cannot use memory_no_alias for r0-r3.
+
+   NOTE: Somehow it is important to use memory_no_alias() first in the
+         conjunctions defining each case.
+*/
+#if MLKEM_K == 2
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
+    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+);
+#elif MLKEM_K == 4
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case B: r0, r1, r2, r3 consecutive */
+    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+);
+#elif MLKEM_K == 3
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case C: r0, r1, r2 consecutive */
+ (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
+  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+);
+#endif /* MLKEM_K */
+
+#if MLKEM_ETA1 == MLKEM_ETA2
+/*
+ * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
+ * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
+ * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
+ */
+#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
+#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2)
+/*************************************************
+ * Name:        poly_getnoise_eta2
+ *
+ * Description: Sample a polynomial deterministically from a seed and a nonce,
+ *              with output polynomial close to centered binomial distribution
+ *              with parameter MLKEM_ETA2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
+);
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+#if MLKEM_K == 2
+#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1122_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial
+ * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+__contract__(
+  requires( /* r0, r1 consecutive, r2, r3 consecutive */
+ (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
+     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1))
+);
+#endif /* MLKEM_K == 2 */
+#endif /* MLKEM_K == 2 */
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.c
deleted file mode 100644
index 50ea1c34a..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.c
+++ /dev/null
@@ -1,330 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "polyvec.h"
-#include <stdint.h>
-#include <string.h>
-#include "arith_backend.h"
-#include "cbd.h"
-#include "ntt.h"
-#include "poly.h"
-#include "symmetric.h"
-
-#include "debug.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1)
-#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2)
-/* End of static namespacing */
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
-                         const polyvec *a)
-{
-  unsigned i;
-  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_decompress_du(polyvec *r,
-                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
-  }
-
-  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
-{
-  unsigned i;
-  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
-  }
-
-  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_ntt(polyvec *r)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_ntt(&r->vec[i]);
-  }
-
-  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_invntt_tomont(polyvec *r)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_invntt_tomont(&r->vec[i]);
-  }
-
-  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND);
-}
-
-#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
-                                           const polyvec *b,
-                                           const polyvec_mulcache *b_cache)
-{
-  unsigned i;
-  poly t;
-  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
-
-  poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]);
-  for (i = 1; i < MLKEM_K; i++)
-  {
-    poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i],
-                                   &b_cache->vec[i]);
-    poly_add(r, &t);
-  }
-
-  /*
-   * This bound is true for the C implementation, but not needed
-   * in the higher level bounds reasoning. It is thus omitted
-   * them from the spec to not unnecessarily constrain native
-   * implementations, but checked here nonetheless.
-   */
-  debug_assert_abs_bound(r, MLKEM_K, MLKEM_N * 2 * MLKEM_Q);
-}
-#else  /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
-                                           const polyvec *b,
-                                           const polyvec_mulcache *b_cache)
-{
-  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
-  /* Omitting bounds assertion for cache since native implementations may
-   * decide not to use a mulcache. Note that the C backend implementation
-   * of poly_basemul_montgomery_cached() does still include the check. */
-  polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache);
-}
-#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
-{
-  polyvec_mulcache b_cache;
-  polyvec_mulcache_compute(&b_cache, b);
-  polyvec_basemul_acc_montgomery_cached(r, a, b, &b_cache);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_mulcache_compute(&x->vec[i], &a->vec[i]);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_reduce(polyvec *r)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_reduce(&r->vec[i]);
-  }
-
-  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_add(polyvec *r, const polyvec *b)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_add(&r->vec[i], &b->vec[i]);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_tomont(polyvec *r)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_tomont(&r->vec[i]);
-  }
-
-  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
-}
-
-
-/*************************************************
- * Name:        poly_cbd_eta1
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA1.
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-static INLINE void poly_cbd_eta1(poly *r,
-                                 const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
-)
-{
-#if MLKEM_ETA1 == 2
-  poly_cbd2(r, buf);
-#elif MLKEM_ETA1 == 3
-  poly_cbd3(r, buf);
-#else
-#error "Invalid value of MLKEM_ETA1"
-#endif
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-{
-  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
-  memcpy(extkey0, seed, MLKEM_SYMBYTES);
-  memcpy(extkey1, seed, MLKEM_SYMBYTES);
-  memcpy(extkey2, seed, MLKEM_SYMBYTES);
-  memcpy(extkey3, seed, MLKEM_SYMBYTES);
-  extkey0[MLKEM_SYMBYTES] = nonce0;
-  extkey1[MLKEM_SYMBYTES] = nonce1;
-  extkey2[MLKEM_SYMBYTES] = nonce2;
-  extkey3[MLKEM_SYMBYTES] = nonce3;
-  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
-  poly_cbd_eta1(r0, buf0);
-  poly_cbd_eta1(r1, buf1);
-  poly_cbd_eta1(r2, buf2);
-  poly_cbd_eta1(r3, buf3);
-
-  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
-}
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-/*************************************************
- * Name:        poly_cbd_eta2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA2.
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-static INLINE void poly_cbd_eta2(poly *r,
-                                 const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1)))
-{
-#if MLKEM_ETA2 == 2
-  poly_cbd2(r, buf);
-#else
-#error "Invalid value of MLKEM_ETA2"
-#endif
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-{
-  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
-
-  memcpy(extkey, seed, MLKEM_SYMBYTES);
-  extkey[MLKEM_SYMBYTES] = nonce;
-  prf_eta2(buf, extkey);
-
-  poly_cbd_eta2(r, buf);
-
-  debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA1 + 1);
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-
-#if MLKEM_K == 2
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-{
-  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
-  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
-  extkey[0][MLKEM_SYMBYTES] = nonce0;
-  extkey[1][MLKEM_SYMBYTES] = nonce1;
-  extkey[2][MLKEM_SYMBYTES] = nonce2;
-  extkey[3][MLKEM_SYMBYTES] = nonce3;
-
-  prf_eta1(buf1[0], extkey[0]);
-  prf_eta1(buf1[1], extkey[1]);
-  prf_eta2(buf2[0], extkey[2]);
-  prf_eta2(buf2[1], extkey[3]);
-
-  poly_cbd_eta1(r0, buf1[0]);
-  poly_cbd_eta1(r1, buf1[1]);
-  poly_cbd_eta2(r2, buf2[0]);
-  poly_cbd_eta2(r3, buf2[1]);
-
-  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1);
-  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1);
-}
-#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.h
deleted file mode 100644
index 8be8579e0..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/polyvec.h
+++ /dev/null
@@ -1,595 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef POLYVEC_H
-#define POLYVEC_H
-
-#include <stdint.h>
-#include "common.h"
-#include "poly.h"
-
-#define polyvec MLKEM_NAMESPACE_K(polyvec)
-typedef struct
-{
-  poly vec[MLKEM_K];
-} ALIGN polyvec;
-
-#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache)
-typedef struct
-{
-  poly_mulcache vec[MLKEM_K];
-} polyvec_mulcache;
-
-#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du)
-/*************************************************
- * Name:        poly_compress_du
- *
- * Description: Compression (du bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU],
-                                    const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU)))
-{
-#if MLKEM_DU == 10
-  poly_compress_d10(r, a);
-#elif MLKEM_DU == 11
-  poly_compress_d11(r, a);
-#else
-#error "Invalid value of MLKEM_DU"
-#endif
-}
-
-#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du)
-/*************************************************
- * Name:        poly_decompress_du
- *
- * Description: De-serialization and subsequent decompression (du bits) of a
- *              polynomial; approximate inverse of poly_compress_du
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-static INLINE void poly_decompress_du(
-    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-#if MLKEM_DU == 10
-  poly_decompress_d10(r, a);
-#elif MLKEM_DU == 11
-  poly_decompress_d11(r, a);
-#else
-#error "Invalid value of MLKEM_DU"
-#endif
-}
-
-#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv)
-/*************************************************
- * Name:        poly_compress_dv
- *
- * Description: Compression (dv bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV],
-                                    const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r)))
-{
-#if MLKEM_DV == 4
-  poly_compress_d4(r, a);
-#elif MLKEM_DV == 5
-  poly_compress_d5(r, a);
-#else
-#error "Invalid value of MLKEM_DV"
-#endif
-}
-
-
-#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv)
-/*************************************************
- * Name:        poly_decompress_dv
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-static INLINE void poly_decompress_dv(
-    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-#if MLKEM_DV == 4
-  poly_decompress_d4(r, a);
-#elif MLKEM_DV == 5
-  poly_decompress_d5(r, a);
-#else
-#error "Invalid value of MLKEM_DV"
-#endif
-}
-
-#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du)
-/*************************************************
- * Name:        polyvec_compress_du
- *
- * Description: Compress and serialize vector of polynomials
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU)
- *              - const polyvec *a: pointer to input vector of polynomials.
- *                                  Coefficients must be unsigned canonical,
- *                                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
-                         const polyvec *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  requires(forall(k0, 0, MLKEM_K,
-         array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-  assigns(object_whole(r))
-);
-
-#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du)
-/*************************************************
- * Name:        polyvec_decompress_du
- *
- * Description: De-serialize and decompress vector of polynomials;
- *              approximate inverse of polyvec_compress_du
- *
- * Arguments:   - polyvec *r:       pointer to output vector of polynomials.
- *                Output will have coefficients normalized to [0,..,q-1].
- *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU)
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_decompress_du(polyvec *r,
-                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  assigns(object_whole(r))
-  ensures(forall(k0, 0, MLKEM_K,
-         array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-);
-
-#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes)
-/*************************************************
- * Name:        polyvec_tobytes
- *
- * Description: Serialize vector of polynomials
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (needs space for MLKEM_POLYVECBYTES)
- *              - const polyvec *a: pointer to input vector of polynomials
- *                  Each polynomial must have coefficients in [0,..,q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
-__contract__(
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  requires(memory_no_alias(r, MLKEM_POLYVECBYTES))
-  requires(forall(k0, 0, MLKEM_K,
-         array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-  assigns(object_whole(r))
-);
-
-#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes)
-/*************************************************
- * Name:        polyvec_frombytes
- *
- * Description: De-serialize vector of polynomials;
- *              inverse of polyvec_tobytes
- *
- * Arguments:   - const polyvec *a: pointer to output vector of polynomials
- *                 (of length MLKEM_POLYVECBYTES). Output will have coefficients
- *                 normalized in [0..4095].
- *              - uint8_t *r: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  requires(memory_no_alias(a, MLKEM_POLYVECBYTES))
-  assigns(object_whole(r))
-  ensures(forall(k0, 0, MLKEM_K,
-        array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
-);
-
-#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt)
-/*************************************************
- * Name:        polyvec_ntt
- *
- * Description: Apply forward NTT to all elements of a vector of polynomials.
- *
- *              The input is assumed to be in normal order and
- *              coefficient-wise bound by MLKEM_Q in absolute value.
- *
- *              The output polynomial is in bitreversed order, and
- *              coefficient-wise bound by NTT_BOUND in absolute value.
- *
- * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_ntt(polyvec *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  requires(forall(j, 0, MLKEM_K,
-  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
-  assigns(object_whole(r))
-  ensures(forall(j, 0, MLKEM_K,
-  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND)))
-);
-
-#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont)
-/*************************************************
- * Name:        polyvec_invntt_tomont
- *
- * Description: Apply inverse NTT to all elements of a vector of polynomials
- *              and multiply by Montgomery factor 2^16
- *
- *              The input is assumed to be in bitreversed order, and can
- *              have arbitrary coefficients in int16_t.
- *
- *              The output polynomial is in normal order, and
- *              coefficient-wise bound by INVNTT_BOUND in absolute value.
- *
- *
- * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_invntt_tomont(polyvec *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  assigns(object_whole(r))
-  ensures(forall(j, 0, MLKEM_K,
-  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, INVNTT_BOUND)))
-);
-
-#define polyvec_basemul_acc_montgomery \
-  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery)
-/*************************************************
- * Name:        polyvec_basemul_acc_montgomery
- *
- * Description: Multiply elements of a and b in NTT domain, accumulate into r,
- *              and multiply by 2^-16.
- *
- * Arguments: - poly *r: pointer to output polynomial
- *            - const polyvec *a: pointer to first input vector of polynomials
- *            - const polyvec *b: pointer to second input vector of polynomials
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  requires(memory_no_alias(b, sizeof(polyvec)))
-  requires(forall(k1, 0, MLKEM_K,
-    array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
-  assigns(memory_slice(r, sizeof(poly)))
-);
-
-
-#define polyvec_basemul_acc_montgomery_cached \
-  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached)
-/*************************************************
- * Name:        polyvec_basemul_acc_montgomery_cached
- *
- * Description: Scalar product of two vectors of polynomials in NTT domain,
- *              using mulcache for second operand.
- *
- *              Bounds:
- *              - Every coefficient of a is assumed to be in [0..4095]
- *              - No bounds guarantees for the coefficients in the result.
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const polyvec *a: pointer to first input polynomial vector
- *              - const polyvec *b: pointer to second input polynomial vector
- *              - const polyvec_mulcache *b_cache: pointer to mulcache
- *                  for second input polynomial vector. Can be computed
- *                  via polyvec_mulcache_compute().
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
-                                           const polyvec *b,
-                                           const polyvec_mulcache *b_cache)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  requires(memory_no_alias(b, sizeof(polyvec)))
-  requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache)))
-  requires(forall(k1, 0, MLKEM_K,
-     array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
-  assigns(memory_slice(r, sizeof(poly)))
-);
-
-#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute)
-/************************************************************
- * Name: polyvec_mulcache_compute
- *
- * Description: Computes the mulcache for a vector of polynomials in NTT domain
- *
- *              The mulcache of a degree-2 polynomial b := b0 + b1*X
- *              in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when
- *              computing products of b in Fq[X]/(X^2-zeta).
- *
- *              The mulcache of a polynomial in NTT domain -- which is
- *              a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta),
- *              for varying zeta, is the 128-tuple of mulcaches of those
- *              polynomials.
- *
- *              The mulcache of a vector of polynomials is the vector
- *              of mulcaches of its entries.
- *
- * Arguments: - x: Pointer to mulcache to be populated
- *            - a: Pointer to input polynomial vector
- ************************************************************/
-/*
- * NOTE: The default C implementation of this function populates
- * the mulcache with values in (-q,q), but this is not needed for the
- * higher level safety proofs, and thus not part of the spec.
- */
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
-__contract__(
-  requires(memory_no_alias(x, sizeof(polyvec_mulcache)))
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  assigns(object_whole(x))
-);
-
-#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce)
-/*************************************************
- * Name:        polyvec_reduce
- *
- * Description: Applies Barrett reduction to each coefficient
- *              of each element of a vector of polynomials;
- *              for details of the Barrett reduction see comments in reduce.c
- *
- * Arguments:   - polyvec *r: pointer to input/output polynomial
- **************************************************/
-/*
- * NOTE: The semantics of polyvec_reduce() is different in
- *       the reference implementation, which requires
- *       signed canonical output data. Unsigned canonical
- *       outputs are better suited to the only remaining
- *       use of poly_reduce() in the context of (de)serialization.
- */
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_reduce(polyvec *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  assigns(object_whole(r))
-  ensures(forall(k0, 0, MLKEM_K,
-    array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-);
-
-#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add)
-/*************************************************
- * Name:        polyvec_add
- *
- * Description: Add vectors of polynomials
- *
- * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be
- *              added to
- *            - const polyvec *b: pointer to second input vector of polynomials
- *
- * The coefficients of r and b must be so that the addition does
- * not overflow. Otherwise, the behaviour of this function is undefined.
- *
- * The coefficients returned in *r are in int16_t which is sufficient
- * to prove type-safety of calling units. Therefore, no stronger
- * ensures clause is required on this function.
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_add(polyvec *r, const polyvec *b)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  requires(memory_no_alias(b, sizeof(polyvec)))
-  requires(forall(j0, 0, MLKEM_K,
-          forall(k0, 0, MLKEM_N,
-            (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX)))
-  requires(forall(j1, 0, MLKEM_K,
-          forall(k1, 0, MLKEM_N,
-            (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN)))
-  assigns(object_whole(r))
-);
-
-#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont)
-/*************************************************
- * Name:        polyvec_tomont
- *
- * Description: Inplace conversion of all coefficients of a polynomial
- *              vector from normal domain to Montgomery domain
- *
- *              Bounds: Output < q in absolute value.
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_tomont(polyvec *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  assigns(memory_slice(r, sizeof(polyvec)))
-  assigns(object_whole(r))
-  ensures(forall(j, 0, MLKEM_K,
-    array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
-);
-
-#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and nonces, with output polynomials close to centered binomial distribution
- * with parameter MLKEM_ETA1.
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-/* Depending on MLKEM_K, the pointers passed to this function belong
-   to the same objects, so we cannot use memory_no_alias for r0-r3.
-
-   NOTE: Somehow it is important to use memory_no_alias() first in the
-         conjunctions defining each case.
-*/
-#if MLKEM_K == 2
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
-    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 4
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case B: r0, r1, r2, r3 consecutive */
-    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 3
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case C: r0, r1, r2 consecutive */
- (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
-  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#endif /* MLKEM_K */
-
-#if MLKEM_ETA1 == MLKEM_ETA2
-/*
- * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
- * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
- * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
- */
-#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
-#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2)
-/*************************************************
- * Name:        poly_getnoise_eta2
- *
- * Description: Sample a polynomial deterministically from a seed and a nonce,
- *              with output polynomial close to centered binomial distribution
- *              with parameter MLKEM_ETA2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1122_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and a nonces, with output polynomials close to centered binomial
- * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-__contract__(
-  requires( /* r0, r1 consecutive, r2, r3 consecutive */
- (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
-  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
-     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
-);
-#endif /* MLKEM_K == 2 */
-
-#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/reduce.h
deleted file mode 100644
index b432a4201..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/reduce.h
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef REDUCE_H
-#define REDUCE_H
-
-#include <stdint.h>
-#include "cbmc.h"
-#include "common.h"
-#include "debug.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16)
-#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic)
-#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce)
-#define fqmul MLKEM_NAMESPACE(fqmul)
-#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce)
-/* End of static namespacing */
-
-#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */
-
-/*************************************************
- * Name:        cast_uint16_to_int16
- *
- * Description: Cast uint16 value to int16
- *
- * Returns:
- *   input x in     0 .. 32767: returns value unchanged
- *   input x in 32768 .. 65535: returns (x - 65536)
- **************************************************/
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
-ALWAYS_INLINE
-static INLINE int16_t cast_uint16_to_int16(uint16_t x)
-{
-  /*
-   * PORTABILITY: This relies on uint16_t -> int16_t
-   * being implemented as the inverse of int16_t -> uint16_t,
-   * which is implementation-defined (C99 6.3.1.3 (3))
-   * CBMC (correctly) fails to prove this conversion is OK,
-   * so we have to suppress that check here
-   */
-  return (int16_t)x;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/*************************************************
- * Name:        montgomery_reduce_generic
- *
- * Description: Generic Montgomery reduction; given a 32-bit integer a, computes
- *              16-bit integer congruent to a * R^-1 mod q, where R=2^16
- *
- * Arguments:   - int32_t a: input integer to be reduced
- *
- * Returns:     integer congruent to a * R^-1 modulo q, with absolute value
- *                <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2
- *
- **************************************************/
-ALWAYS_INLINE
-static INLINE int16_t montgomery_reduce_generic(int32_t a)
-{
-  /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */
-  const uint32_t QINV = 62209; /* q^-1 mod 2^16 */
-
-  /*  Compute a*q^{-1} mod 2^16 in unsigned representatives */
-  const uint16_t a_reduced = a & UINT16_MAX;
-  const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
-
-  /* Lift to signed canonical representative mod 2^16. */
-  const int16_t t = cast_uint16_to_int16(a_inverted);
-
-  int32_t r = a - ((int32_t)t * MLKEM_Q);
-  /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */
-
-  /*
-   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
-   * implementation-defined for negative left argument. Here,
-   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
-   */
-  r = r >> 16;
-  /* Bounds: |r >> 16| <= ceil(|r| / 2^16)
-   *                   <= ceil(|a| / 2^16 + MLKEM_Q / 2)
-   *                   <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
-   *
-   * (Note that |a >> n| = ceil(|a| / 2^16) for negative a)
-   */
-
-  return (int16_t)r;
-}
-
-/*************************************************
- * Name:        montgomery_reduce
- *
- * Description: Montgomery reduction
- *
- * Arguments:   - int32_t a: input integer to be reduced
- *                  Must be smaller than 2 * 2^12 * 2^15 in absolute value.
- *
- * Returns:     integer congruent to a * R^-1 modulo q,
- *              smaller than 2 * q in absolute value.
- **************************************************/
-static INLINE int16_t montgomery_reduce(int32_t a)
-__contract__(
-  requires(a > -(2 * UINT12_LIMIT * 32768))
-  requires(a <  (2 * UINT12_LIMIT * 32768))
-  ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
-)
-{
-  int16_t res;
-  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
-
-  res = montgomery_reduce_generic(a);
-  /* Bounds:
-   * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2
-   *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
-   *        < 2 * MLKEM_Q */
-
-  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
-  return res;
-}
-
-/*************************************************
- * Name:        fqmul
- *
- * Description: Montgomery multiplication modulo q=3329
- *
- * Arguments:   - int16_t a: first factor
- *                  Can be any int16_t.
- *              - int16_t b: second factor.
- *                  Must be signed canonical (abs value <(q+1)/2)
- *
- * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and
- * smaller than q in absolute value.
- *
- **************************************************/
-static INLINE int16_t fqmul(int16_t a, int16_t b)
-__contract__(
-  requires(b > -HALF_Q)
-  requires(b < HALF_Q)
-  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
-)
-{
-  int16_t res;
-  debug_assert_abs_bound(&b, 1, HALF_Q);
-
-  res = montgomery_reduce((int32_t)a * (int32_t)b);
-  /* Bounds:
-   * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
-   *        < MLKEM_Q
-   */
-
-  debug_assert_abs_bound(&res, 1, MLKEM_Q);
-  return res;
-}
-
-/*************************************************
- * Name:        barrett_reduce
- *
- * Description: Barrett reduction; given a 16-bit integer a, computes
- *              centered representative congruent to a mod q in
- *              {-(q-1)/2,...,(q-1)/2}
- *
- * Arguments:   - int16_t a: input integer to be reduced
- *
- * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
- **************************************************/
-static INLINE int16_t barrett_reduce(int16_t a)
-__contract__(
-  ensures(return_value > -HALF_Q && return_value < HALF_Q)
-)
-{
-  /*
-   * To divide by MLKEM_Q using Barrett multiplication, the "magic number"
-   * multiplier is round_to_nearest(2**26/MLKEM_Q)
-   */
-  const int BPOWER = 26;
-  const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q;
-
-  /*
-   * Compute round_to_nearest(a/MLKEM_Q) using the multiplier
-   * above and shift by BPOWER places.
-   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
-   * implementation-defined for negative left argument. Here,
-   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
-   */
-  const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER;
-
-  /*
-   * t is in -10 .. +10, so we need 32-bit math to
-   * evaluate t * MLKEM_Q and the subsequent subtraction
-   */
-  int16_t res = (int16_t)(a - t * MLKEM_Q);
-
-  debug_assert_abs_bound(&res, 1, HALF_Q);
-  return res;
-}
-
-#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.c
deleted file mode 100644
index cbbe4407f..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.c
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-
-#include "arith_backend.h"
-#include "debug.h"
-#include "fips202.h"
-#include "fips202x4.h"
-#include "rej_uniform.h"
-#include "symmetric.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
-#define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
-/* End of static namespacing */
-
-static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
-                                       unsigned int offset, const uint8_t *buf,
-                                       unsigned int buflen)
-__contract__(
-  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
-  requires(memory_no_alias(r, sizeof(int16_t) * target))
-  requires(memory_no_alias(buf, buflen))
-  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * target))
-  ensures(offset <= return_value && return_value <= target)
-  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
-)
-{
-  unsigned int ctr, pos;
-  uint16_t val0, val1;
-
-  debug_assert_bound(r, offset, 0, MLKEM_Q);
-
-  ctr = offset;
-  pos = 0;
-  /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
-  while (ctr < target && pos + 3 <= buflen)
-  __loop__(
-    invariant(offset <= ctr && ctr <= target && pos <= buflen)
-    invariant(ctr > 0 ==> array_bound(r, 0, ctr, 0, MLKEM_Q)))
-  {
-    val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
-    val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
-    pos += 3;
-
-    if (val0 < MLKEM_Q)
-    {
-      r[ctr++] = val0;
-    }
-    if (ctr < target && val1 < MLKEM_Q)
-    {
-      r[ctr++] = val1;
-    }
-  }
-
-  debug_assert_bound(r, ctr, 0, MLKEM_Q);
-  return ctr;
-}
-
-#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
-/*************************************************
- * Name:        rej_uniform
- *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
- *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
- *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
- **************************************************/
-
-/*
- * NOTE: The signature differs from the Kyber reference implementation
- * in that it adds the offset and always expects the base of the target
- * buffer. This avoids shifting the buffer base in the caller, which appears
- * tricky to reason about.
- */
-static unsigned int rej_uniform(int16_t *r, unsigned int target,
-                                unsigned int offset, const uint8_t *buf,
-                                unsigned int buflen)
-__contract__(
-  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
-  requires(memory_no_alias(r, sizeof(int16_t) * target))
-  requires(memory_no_alias(buf, buflen))
-  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * target))
-  ensures(offset <= return_value && return_value <= target)
-  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
-)
-{
-  return rej_uniform_scalar(r, target, offset, buf, buflen);
-}
-#else  /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-static unsigned int rej_uniform(int16_t *r, unsigned int target,
-                                unsigned int offset, const uint8_t *buf,
-                                unsigned int buflen)
-{
-  int ret;
-
-  /* Sample from large buffer with full lane as much as possible. */
-  ret = rej_uniform_native(r + offset, target - offset, buf, buflen);
-  if (ret != -1)
-  {
-    unsigned res = offset + (unsigned)ret;
-    debug_assert_bound(r, res, 0, MLKEM_Q);
-    return res;
-  }
-
-  return rej_uniform_scalar(r, target, offset, buf, buflen);
-}
-#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-
-#ifndef MLKEM_GEN_MATRIX_NBLOCKS
-#define MLKEM_GEN_MATRIX_NBLOCKS \
-  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
-#endif
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
-{
-  /* Temporary buffers for XOF output before rejection sampling */
-  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-
-  /* Tracks the number of coefficients we have already sampled */
-  unsigned int ctr[KECCAK_WAY];
-  xof_x4_ctx statex;
-  unsigned int buflen;
-
-  shake128x4_inc_init(&statex);
-
-  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
-  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
-                MLKEM_SYMBYTES + 2);
-
-  /*
-   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   * This should generate the matrix entries with high probability.
-   */
-  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
-                       &statex);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
-  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
-  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
-  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
-
-  /*
-   * So long as not all matrix entries have been generated, squeeze
-   * one more block a time until we're done.
-   */
-  buflen = XOF_RATE;
-  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
-         ctr[3] < MLKEM_N)
-  __loop__(
-    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
-       object_whole(buf1), object_whole(buf2), object_whole(buf3))
-    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
-    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
-    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
-    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
-    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
-    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
-  {
-    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
-    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
-    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
-    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
-    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
-  }
-
-  xof_x4_release(&statex);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
-{
-  xof_ctx state;
-  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  unsigned int ctr, buflen;
-
-  shake128_inc_init(&state);
-
-  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
-
-  /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   */
-  /* This should generate the matrix entry with high probability. */
-  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
-
-  /* Squeeze + sample one more block a time until we're done */
-  buflen = XOF_RATE;
-  while (ctr < MLKEM_N)
-  __loop__(
-    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
-    invariant(ctr <= MLKEM_N)
-    invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q)))
-  {
-    xof_squeezeblocks(buf, 1, &state);
-    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
-  }
-
-  xof_release(&state);
-}
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform)
-int empty_cu_rej_uniform;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.h
deleted file mode 100644
index 801287259..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/rej_uniform.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef REJ_UNIFORM_H
-#define REJ_UNIFORM_H
-
-#include <stdint.h>
-#include <stdlib.h>
-#include "cbmc.h"
-#include "common.h"
-#include "poly.h"
-
-#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
-/*************************************************
- * Name:        poly_rej_uniform_x4
- *
- * Description: Generate four polynomials using rejection sampling
- *              on (pseudo-)uniformly random bytes sampled from a seed.
- *
- * Arguments:   - poly *vec:           Pointer to an array of 4 polynomials
- *                                     to be sampled.
- *              - uint8_t *seed[4]:    Pointer to array of four pointers
- *                                     pointing to the seed buffers of size
- *                                     MLKEM_SYMBYTES + 2 each.
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
-__contract__(
-  requires(memory_no_alias(vec, sizeof(poly) * 4))
-  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
-  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(vec, sizeof(poly) * 4))
-  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
-
-#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform)
-/*************************************************
- * Name:        poly_rej_uniform
- *
- * Description: Generate polynomial using rejection sampling
- *              on (pseudo-)uniformly random bytes sampled from a seed.
- *
- * Arguments:   - poly *vec:           Pointer to polynomial to be sampled.
- *              - uint8_t *seed:       Pointer to seed buffer of size
- *                                     MLKEM_SYMBYTES + 2 each.
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
-__contract__(
-  requires(memory_no_alias(entry, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(entry, sizeof(poly)))
-  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
-
-#endif /* REJ_UNIFORM_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/sampling.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/sampling.c
new file mode 100644
index 000000000..98cbdcb74
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/sampling.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
+#include "arith_backend.h"
+#include "debug.h"
+#include "fips202.h"
+#include "fips202x4.h"
+#include "sampling.h"
+#include "symmetric.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
+#define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
+#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
+#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
+/* End of static namespacing */
+
+static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
+                                       unsigned int offset, const uint8_t *buf,
+                                       unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
+)
+{
+  unsigned int ctr, pos;
+  uint16_t val0, val1;
+
+  debug_assert_bound(r, offset, 0, MLKEM_Q);
+
+  ctr = offset;
+  pos = 0;
+  /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
+  while (ctr < target && pos + 3 <= buflen)
+  __loop__(
+    invariant(offset <= ctr && ctr <= target && pos <= buflen)
+    invariant(ctr > 0 ==> array_bound(r, 0, ctr, 0, MLKEM_Q)))
+  {
+    val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
+    val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
+    pos += 3;
+
+    if (val0 < MLKEM_Q)
+    {
+      r[ctr++] = val0;
+    }
+    if (ctr < target && val1 < MLKEM_Q)
+    {
+      r[ctr++] = val1;
+    }
+  }
+
+  debug_assert_bound(r, ctr, 0, MLKEM_Q);
+  return ctr;
+}
+
+#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
+/*************************************************
+ * Name:        rej_uniform
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int target: requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *                                     Must be <= 4096.
+ *              - unsigned int offset: number of 16-bit integers that have
+ *                                     already been sampled.
+ *                                     Must be <= target.
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes
+ *                                     Must be <= 4096.
+ *                                     Must be a multiple of 3.
+ *
+ * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
+ * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
+ *
+ * Returns the new offset of sampled 16-bit integers, at most target,
+ * and at least the initial offset.
+ * If the new offset is strictly less than target, the whole input buffer is
+ * guaranteed to have been consumed. If it is equal to target, no information
+ * is provided on how many bytes of the input buffer have been consumed.
+ **************************************************/
+
+/*
+ * NOTE: The signature differs from the Kyber reference implementation
+ * in that it adds the offset and always expects the base of the target
+ * buffer. This avoids shifting the buffer base in the caller, which appears
+ * tricky to reason about.
+ */
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
+)
+{
+  return rej_uniform_scalar(r, target, offset, buf, buflen);
+}
+#else  /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
+{
+  int ret;
+
+  /* Sample from large buffer with full lane as much as possible. */
+  ret = rej_uniform_native(r + offset, target - offset, buf, buflen);
+  if (ret != -1)
+  {
+    unsigned res = offset + (unsigned)ret;
+    debug_assert_bound(r, res, 0, MLKEM_Q);
+    return res;
+  }
+
+  return rej_uniform_scalar(r, target, offset, buf, buflen);
+}
+#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+#ifndef MLKEM_GEN_MATRIX_NBLOCKS
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
+#endif
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+{
+  /* Temporary buffers for XOF output before rejection sampling */
+  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+
+  /* Tracks the number of coefficients we have already sampled */
+  unsigned int ctr[KECCAK_WAY];
+  xof_x4_ctx statex;
+  unsigned int buflen;
+
+  shake128x4_inc_init(&statex);
+
+  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
+  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
+                MLKEM_SYMBYTES + 2);
+
+  /*
+   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   * This should generate the matrix entries with high probability.
+   */
+  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
+                       &statex);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
+  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
+  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
+  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
+
+  /*
+   * So long as not all matrix entries have been generated, squeeze
+   * one more block at a time until we're done.
+   */
+  buflen = XOF_RATE;
+  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
+         ctr[3] < MLKEM_N)
+  __loop__(
+    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
+       object_whole(buf1), object_whole(buf2), object_whole(buf3))
+    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
+    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
+    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
+    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
+    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
+    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
+  {
+    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
+    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
+    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
+    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
+    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
+  }
+
+  xof_x4_release(&statex);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+{
+  xof_ctx state;
+  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  unsigned int ctr, buflen;
+
+  shake128_inc_init(&state);
+
+  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
+
+  /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   */
+  /* This should generate the matrix entry with high probability. */
+  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
+
+  /* Squeeze + sample one more block at a time until we're done */
+  buflen = XOF_RATE;
+  while (ctr < MLKEM_N)
+  __loop__(
+    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
+    invariant(ctr <= MLKEM_N)
+    invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q)))
+  {
+    xof_squeezeblocks(buf, 1, &state);
+    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
+  }
+
+  xof_release(&state);
+}
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
+#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
+/* End of static namespacing */
+
+/*************************************************
+ * Name:        load32_littleendian
+ *
+ * Description: load 4 bytes into a 32-bit integer
+ *              in little-endian order
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x
+ **************************************************/
+static uint32_t load32_littleendian(const uint8_t x[4])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  r |= (uint32_t)x[3] << 24;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
+  {
+    unsigned j;
+    uint32_t t = load32_littleendian(buf + 4 * i);
+    uint32_t d = t & 0x55555555;
+    d += (t >> 1) & 0x55555555;
+
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
+    {
+      const int16_t a = (d >> (4 * j + 0)) & 0x3;
+      const int16_t b = (d >> (4 * j + 2)) & 0x3;
+      r->coeffs[8 * i + j] = a - b;
+    }
+  }
+}
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+/*************************************************
+ * Name:        load24_littleendian
+ *
+ * Description: load 3 bytes into a 32-bit integer
+ *              in little-endian order.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
+ **************************************************/
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 4; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 4)
+    invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
+  {
+    unsigned j;
+    const uint32_t t = load24_littleendian(buf + 3 * i);
+    uint32_t d = t & 0x00249249;
+    d += (t >> 1) & 0x00249249;
+    d += (t >> 2) & 0x00249249;
+
+    for (j = 0; j < 4; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 4 && j <= 4)
+      invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
+    {
+      const int16_t a = (d >> (6 * j + 0)) & 0x7;
+      const int16_t b = (d >> (6 * j + 3)) & 0x7;
+      r->coeffs[4 * i + j] = a - b;
+    }
+  }
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
+          3 */
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_sampling MLKEM_NAMESPACE_K(empty_cu_sampling)
+int empty_cu_sampling;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/sampling.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/sampling.h
new file mode 100644
index 000000000..cc524e0fc
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/sampling.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef SAMPLING_H
+#define SAMPLING_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include "cbmc.h"
+#include "common.h"
+#include "poly.h"
+
+#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
+/*************************************************
+ * Name:        poly_cbd2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
+/*************************************************
+ * Name:        poly_cbd3
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=3.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED || MLKEM_ETA1 == 3 */
+
+#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
+/*************************************************
+ * Name:        poly_rej_uniform_x4
+ *
+ * Description: Generate four polynomials using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
+ *
+ * Arguments:   - poly *vec:           Pointer to an array of 4 polynomials
+ *                                     to be sampled.
+ *              - uint8_t *seed[4]:    Pointer to array of four pointers
+ *                                     pointing to the seed buffers of size
+ *                                     MLKEM_SYMBYTES + 2 each.
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+__contract__(
+  requires(memory_no_alias(vec, sizeof(poly) * 4))
+  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
+  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(vec, sizeof(poly) * 4))
+  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+
+#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform)
+/*************************************************
+ * Name:        poly_rej_uniform
+ *
+ * Description: Generate polynomial using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
+ *
+ * Arguments:   - poly *entry:         Pointer to polynomial to be sampled.
+ *              - uint8_t *seed:       Pointer to seed buffer of size
+ *                                     MLKEM_SYMBYTES + 2.
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+__contract__(
+  requires(memory_no_alias(entry, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(entry, sizeof(poly)))
+  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+
+#endif /* SAMPLING_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/zetas.c
index 4ef887c62..987f0dce4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/zetas.c
@@ -10,7 +10,7 @@
 
 #include "common.h"
 #if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-#include "ntt.h"
+#include "poly.h"
 
 /*
  * Table of zeta values used in the reference NTT and inverse NTT.
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/api.h
deleted file mode 100644
index 792ecb8a4..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/api.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-/*
- * Native arithmetic interface
- *
- * This header is primarily for documentation purposes.
- * It should not be included by backend implementations.
- *
- * To ensure consistency with backends, the header will be
- * included automatically after inclusion of the active
- * backend, to ensure consistency of function signatures,
- * and run sanity checks.
- */
-#ifdef MLKEM_NATIVE_ARITH_NATIVE_API_H
-#error \
-    "The arithmetic backend API `mlkem/native/api.h` "		\
-    "should not be directly included. Please include the relevant "	\
-    "structure headers directly."
-#else /* MLKEM_NATIVE_ARITH_NATIVE_API_H */
-#define MLKEM_NATIVE_ARITH_NATIVE_API_H
-
-#include <stdint.h>
-#include "poly.h"
-#include "polyvec.h"
-
-/*
- * This is the C<->native interface allowing for the drop-in of
- * native code for performance critical arithmetic components of ML-KEM.
- *
- * A _backend_ is a specific implementation of (part of) this interface.
- *
- * To add a function to a backend, define MLKEM_USE_NATIVE_XXX and
- * implement `static inline xxx(...)` in the profile header.
- *
- * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can
- * be set if there are native implementations for all of NTT, invNTT, and
- * base multiplication, and allows the native implementation to use a
- * custom order of polynomial coefficients in NTT domain -- the use of such
- * custom order is not an implementation-detail since the public matrix
- * is generated in NTT domain. In this case, a permutation function
- * poly_permute_bitrev_to_custom() needs to be provided that permutes
- * polynomials in NTT domain from bitreversed to the custom order.
- */
-
-/*
- * Those functions are meant to be trivial wrappers around the chosen native
- * implementation. The are static inline to avoid unnecessary calls.
- * The macro before each declaration controls whether a native
- * implementation is present.
- */
-
-#if defined(MLKEM_USE_NATIVE_NTT)
-/*************************************************
- * Name:        ntt_native
- *
- * Description: Computes negacyclic number-theoretic transform (NTT) of
- *              a polynomial in place.
- *
- *              The input polynomial is assumed to be in normal order.
- *              The output polynomial is in bitreversed order, or of a
- *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
- *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
- *              for more information.
- *
- * Arguments:   - poly *p: pointer to in/output polynomial
- **************************************************/
-static INLINE void ntt_native(poly *);
-#endif /* MLKEM_USE_NATIVE_NTT */
-
-#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
-/*
- * This must only be set if NTT, invNTT, basemul, mulcache, and
- * to/from byte stream conversions all have native implementations
- * that are adapted to the custom order.
- */
-#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \
-    !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) ||                  \
-    !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) ||  \
-    !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) ||                           \
-    !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
-#error \
-    "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \
-set if there are native implementations for NTT, invNTT, mulcache, basemul, \
-and to/from bytes conversions."
-#endif
-
-/*************************************************
- * Name:        poly_permute_bitrev_to_custom
- *
- * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined,
- *              convert a polynomial in NTT domain from bitreversed
- *              order to the custom order output by the native NTT.
- *
- *              This must only be defined if there is native code for
- *              all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache.
- * Arguments:   - poly *p: pointer to in/output polynomial
- *
- **************************************************/
-static INLINE void poly_permute_bitrev_to_custom(poly *);
-#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
-
-#if defined(MLKEM_USE_NATIVE_INTT)
-/*************************************************
- * Name:        intt_native
- *
- * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
- *              of a polynomial in place.
- *
- *              The input polynomial is in bitreversed order, or of a
- *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
- *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
- *              for more information.
- *              The output polynomial is assumed to be in normal order.
- *
- * Arguments:   - uint16_t *a: pointer to in/output polynomial
- **************************************************/
-static INLINE void intt_native(poly *);
-#endif /* MLKEM_USE_NATIVE_INTT */
-
-#if defined(MLKEM_USE_NATIVE_POLY_REDUCE)
-/*************************************************
- * Name:        poly_reduce_native
- *
- * Description: Applies modular reduction to all coefficients of a polynomial.
- *
- * Arguments:   - poly *r: pointer to input/output polynomial
- **************************************************/
-static INLINE void poly_reduce_native(poly *);
-#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
-
-#if defined(MLKEM_USE_NATIVE_POLY_TOMONT)
-/*************************************************
- * Name:        poly_tomont_native
- *
- * Description: Inplace conversion of all coefficients of a polynomial
- *              from normal domain to Montgomery domain
- *
- * Arguments:   - poly *r: pointer to input/output polynomial
- **************************************************/
-static INLINE void poly_tomont_native(poly *);
-#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
-
-#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE)
-/*************************************************
- * Name:        poly_mulcache_compute_native
- *
- * Description: Compute multiplication cache for a polynomial
- *              in NTT domain.
- *
- *              The purpose of the multiplication cache is to
- *              cache repeated computations required during a
- *              base multiplication of polynomials in NTT domain.
- *              The structure of the multiplication-cache is
- *              implementation defined.
- *
- * Arguments:   INPUT:
- *              - poly: const pointer to input polynomial.
- *                  This must be in NTT domain and inin bitreversed order, or of
- *                  a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
- *                  See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
- *                  for more information.
- *              OUTPUT
- *              - cache: pointer to multiplication cache
- **************************************************/
-static INLINE void poly_mulcache_compute_native(poly_mulcache *cache,
-                                                const poly *poly);
-#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
-
-#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
-/*************************************************
- * Name:        poly_mulcache_compute_native
- *
- * Description: Compute multiplication of polynomials in NTT domain.
- *
- * Arguments:   INPUT:
- *              - a: First polynomial operand.
- *                 This must be in NTT domain and inin bitreversed order, or of
- *                 a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
- *                 See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
- *                 for more information.
- *              - b: Second polynomial operand.
- *                 As for a.
- *              - b_cache: Multiplication-cache for b.
- *              OUTPUT
- *              - r: Result of the base multiplication. This is again
- *                   in NTT domain, and of the same order as a and b.
- **************************************************/
-static INLINE void polyvec_basemul_acc_montgomery_cached_native(
-    poly *r, const polyvec *a, const polyvec *b,
-    const polyvec_mulcache *b_cache);
-#endif
-
-#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
-/*************************************************
- * Name:        poly_tobytes_native
- *
- * Description: Serialization of a polynomial.
- *              Signed coefficients are converted to
- *              unsigned form before serialization.
- *
- * Arguments:   INPUT:
- *              - a: const pointer to input polynomial,
- *                with each coefficient in the range -Q+1 .. Q-1
- *              OUTPUT
- *              - r: pointer to output byte array
- *                   (of MLKEM_POLYBYTES bytes)
- **************************************************/
-static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
-                                       const poly *a);
-#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
-
-#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
-/*************************************************
- * Name:        poly_frombytes_native
- *
- * Description: Serialization of a polynomial.
- *              Signed coefficients are converted to
- *              unsigned form before serialization.
- *
- * Arguments:   INPUT:
- *              - r: pointer to output polynomial in NTT domain
- *              OUTPUT
- *              - a: const pointer to input byte aray
- *                   (of MLKEM_POLYBYTES bytes)
- **************************************************/
-static INLINE void poly_frombytes_native(poly *a,
-                                         const uint8_t r[MLKEM_POLYBYTES]);
-#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-
-#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
-/*************************************************
- * Name:        rej_uniform_native
- *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
- *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int len:    requested number of 16-bit integers
- *                                     (uniform mod q).
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes.
- *
- * Return -1 if the native implementation does not support the input lengths.
- * Otherwise, returns non-negative number of sampled 16-bit integers (at most
- * len).
- **************************************************/
-static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
-                                     const uint8_t *buf, unsigned int buflen);
-#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-
-#endif /* MLKEM_NATIVE_ARITH_NATIVE_API_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/arith_backend.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/arith_backend.h
index 0543b1bd1..ade31cda1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/arith_backend.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/arith_backend.h
@@ -17,7 +17,7 @@
  * Keep this _after_ the inclusion of the backend; otherwise,
  * the sanity checks won't have an effect. */
 #if defined(MLKEM_NATIVE_CHECK_APIS)
-#include "api.h"
+#include "native/api.h"
 #endif
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbd.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbd.c
deleted file mode 100644
index 1e6b7c5d1..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbd.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#ifndef MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED
-
-#include <stdint.h>
-#include "cbd.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
-#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
-/* End of static namespacing */
-
-/*************************************************
- * Name:        load32_littleendian
- *
- * Description: load 4 bytes into a 32-bit integer
- *              in little-endian order
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x
- **************************************************/
-static uint32_t load32_littleendian(const uint8_t x[4])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  r |= (uint32_t)x[3] << 24;
-  return r;
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
-  {
-    unsigned j;
-    uint32_t t = load32_littleendian(buf + 4 * i);
-    uint32_t d = t & 0x55555555;
-    d += (t >> 1) & 0x55555555;
-
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
-    {
-      const int16_t a = (d >> (4 * j + 0)) & 0x3;
-      const int16_t b = (d >> (4 * j + 2)) & 0x3;
-      r->coeffs[8 * i + j] = a - b;
-    }
-  }
-}
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
-/*************************************************
- * Name:        load24_littleendian
- *
- * Description: load 3 bytes into a 32-bit integer
- *              in little-endian order.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - const uint8_t *x: pointer to input byte array
- *
- * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
- **************************************************/
-static uint32_t load24_littleendian(const uint8_t x[3])
-{
-  uint32_t r;
-  r = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  return r;
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 4; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 4)
-    invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
-  {
-    unsigned j;
-    const uint32_t t = load24_littleendian(buf + 3 * i);
-    uint32_t d = t & 0x00249249;
-    d += (t >> 1) & 0x00249249;
-    d += (t >> 2) & 0x00249249;
-
-    for (j = 0; j < 4; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 4 && j <= 4)
-      invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
-    {
-      const int16_t a = (d >> (6 * j + 0)) & 0x7;
-      const int16_t b = (d >> (6 * j + 3)) & 0x7;
-      r->coeffs[4 * i + j] = a - b;
-    }
-  }
-}
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
-          3 */
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_cbd MLKEM_NAMESPACE_K(empty_cu_cbd)
-int empty_cu_cbd;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbd.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbd.h
deleted file mode 100644
index 54c1f5b90..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/cbd.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef CBD_H
-#define CBD_H
-
-#include <stdint.h>
-#include "common.h"
-#include "poly.h"
-
-#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
-/*************************************************
- * Name:        poly_cbd2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
-#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
-/*************************************************
- * Name:        poly_cbd3
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter eta=3.
- *              This function is only needed for ML-KEM-512
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD || MLKEM_ETA1 == 3 */
-
-#endif /* CBD_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/common.h
index 4f326333e..62ed53ab1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/common.h
@@ -15,12 +15,19 @@
 #include "sys.h"
 
 /* Include backend metadata */
-#if defined(MLKEM_USE_NATIVE)
-#if defined(MLKEM_NATIVE_ARITH_BACKEND)
-#include MLKEM_NATIVE_ARITH_BACKEND
+#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH)
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_FILE)
+#include MLKEM_NATIVE_ARITH_BACKEND_FILE
+#else
+#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_ARITH is set, but MLKEM_NATIVE_ARITH_BACKEND_FILE is not.
+#endif
 #endif
-#if defined(MLKEM_NATIVE_FIPS202_BACKEND)
-#include MLKEM_NATIVE_FIPS202_BACKEND
+
+#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202)
+#if defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE)
+#include MLKEM_NATIVE_FIPS202_BACKEND_FILE
+#else
+#error Bad configuration: MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, but MLKEM_NATIVE_FIPS202_BACKEND_FILE is not.
 #endif
 #endif
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/compress.c
new file mode 100644
index 000000000..a03fe0ac4
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/compress.c
@@ -0,0 +1,395 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
+#include <stdint.h>
+#include <string.h>
+#include "arith_backend.h"
+#include "cbmc.h"
+#include "compress.h"
+#include "debug.h"
+#include "verify.h"
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  /* Compress each coefficient to 4 bits; 8 coefficients fill 4 bytes. */
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    uint8_t t[8] = {0};
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 16)))
+    {
+      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
+    }
+    /* Pack pairs: even-indexed value in the low nibble, odd in the high. */
+    r[i * 4] = t[0] | (t[1] << 4);
+    r[i * 4 + 1] = t[2] | (t[3] << 4);
+    r[i * 4 + 2] = t[4] | (t[5] << 4);
+    r[i * 4 + 3] = t[6] | (t[7] << 4);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(invariant(j <= MLKEM_N / 4))
+  {
+    unsigned k;
+    uint16_t t[4]; /* four compressed values, each below 1 << 10 */
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k <= 4)
+      invariant(forall(r, 0, k, t[r] < (1u << 10))))
+    {
+      t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
+    }
+
+    /*
+     * Pack the four 10-bit values into 5 output bytes. All truncation
+     * to 8 bits is explicit; no data is lost since each t[k] is 10-bit.
+     */
+    r[5 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
+    r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
+    r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
+    r[5 * j + 4] = (t[3] >> 2);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+{
+  unsigned i; /* input byte index; each byte yields two coefficients */
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
+  {
+    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF); /* low */
+    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF); /* high */
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 4; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 4)
+    invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[4];
+    uint8_t const *base = &a[5 * j];
+    /* Unpack 5 input bytes into four 10-bit values (mask 0x3FF). */
+    t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6));
+    t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4));
+    t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2));
+    /* Decompress each 10-bit value into its coefficient slot in r. */
+    for (k = 0; k < 4; k++)
+    __loop__(
+      invariant(k <= 4)
+      invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    uint8_t t[8] = {0}; /* eight compressed values, each below 32 */
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(t, 0, j, 0, 32)))
+    {
+      t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
+    }
+
+    /*
+     * Pack the eight 5-bit values into 5 output bytes. Truncation to
+     * 8 bits is explicit to avoid CBMC warnings about implicit truncation;
+     * r is indexed as an array (no pointer arithmetic) to ease verification.
+     */
+    r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
+    r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
+    r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
+    r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
+    r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
+{
+  unsigned j;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(invariant(j <= MLKEM_N / 8))
+  {
+    unsigned k;
+    uint16_t t[8]; /* eight compressed values, each below 1 << 11 */
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(forall(r, 0, k, t[r] < (1u << 11))))
+    {
+      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
+    }
+
+    /*
+     * Pack the eight 11-bit values into 11 output bytes. All truncation
+     * to 8 bits is explicit; no data is lost since each t[k] is 11-bit.
+     */
+    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
+    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
+    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
+    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
+    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
+    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
+    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
+    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
+    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
+    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
+    r[11 * j + 10] = (t[7] >> 3);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
+  {
+    unsigned j;
+    uint8_t t[8];
+    const unsigned offset = i * 5;
+    /*
+     * Explicitly truncate to avoid warning about
+     * implicit truncation in CBMC, and write the unpacking
+     * out statement by statement for ease of proof.
+     */
+
+    /*
+     * Unpack 5 8-bit bytes (so 40 bits) into
+     * 8 5-bit values stored in t[]
+     */
+    t[0] = 0x1F & (a[offset + 0] >> 0);
+    t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3));
+    t[2] = 0x1F & (a[offset + 1] >> 2);
+    t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1));
+    t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4));
+    t[5] = 0x1F & (a[offset + 3] >> 1);
+    t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2));
+    t[7] = 0x1F & (a[offset + 4] >> 3);
+
+    /* and decompress each value into the matching slot of r->coeffs */
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(j <= 8 && i <= MLKEM_N / 8)
+      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+{
+  unsigned j;
+  for (j = 0; j < MLKEM_N / 8; j++)
+  __loop__(
+    invariant(j <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
+  {
+    unsigned k;
+    uint16_t t[8]; /* eight unpacked 11-bit values (mask 0x7FF) */
+    uint8_t const *base = &a[11 * j]; /* 11 input bytes per 8 coefficients */
+    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
+    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
+    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
+                    ((uint16_t)base[4] << 10));
+    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
+    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
+    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
+                    ((uint16_t)base[8] << 9));
+    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
+    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
+    /* Decompress each 11-bit value into its coefficient slot in r. */
+    for (k = 0; k < 8; k++)
+    __loop__(
+      invariant(k <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
+    {
+      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
+    }
+  }
+
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(invariant(i <= MLKEM_N / 2))
+  {
+    const uint16_t t0 = a->coeffs[2 * i];     /* even-indexed coefficient */
+    const uint16_t t1 = a->coeffs[2 * i + 1]; /* odd-indexed coefficient */
+    /*
+     * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
+     * significant data, so these can be packed into 24 bits or exactly
+     * 3 bytes, as follows.
+     */
+
+    /* Least significant bits 0 - 7 of t0. */
+    r[3 * i + 0] = t0 & 0xFF;
+
+    /*
+     * Most significant bits 8 - 11 of t0 become the least significant
+     * nibble of the second byte. The least significant 4 bits
+     * of t1 become the upper nibble of the second byte.
+     */
+    r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+
+    /* Bits 4 - 11 of t1 become the third byte. */
+    r[3 * i + 2] = t1 >> 4;
+  }
+}
+#else  /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+{
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  poly_tobytes_native(r, a->coeffs); /* serialization done by the backend */
+}
+#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+  unsigned i; /* index of a 3-byte group, which yields two coefficients */
+  for (i = 0; i < MLKEM_N / 2; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
+  {
+    const uint8_t t0 = a[3 * i + 0];
+    const uint8_t t1 = a[3 * i + 1];
+    const uint8_t t2 = a[3 * i + 2];
+    r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF); /* t0 + low nibble of t1 */
+    r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);    /* high nibble of t1 + t2 */
+  }
+
+  /* Coefficients are only bounded by UINT12_LIMIT, i.e. not canonical */
+  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
+}
+#else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+{
+  poly_frombytes_native(r->coeffs, a); /* deserialization done by the backend */
+}
+#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+{
+  unsigned i;
+#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
+#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
+#endif
+  /* Bit j of msg[i] selects coefficient 8*i+j from {HALF_Q, 0}. */
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
+  {
+    unsigned j;
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <  MLKEM_N / 8 && j <= 8)
+      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
+    {
+      /* Prevent the compiler from recognizing this as a bit selection */
+      uint8_t mask = value_barrier_u8(1u << j);
+      r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
+    }
+  }
+  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q); /* same bound as the invariants */
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
+{
+  unsigned i;
+  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  /* Compress each coefficient to one bit and pack 8 bits per message byte. */
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(invariant(i <= MLKEM_N / 8))
+  {
+    unsigned j;
+    msg[i] = 0; /* cleared first; bits are OR-ed in below */
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8))
+    {
+      uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
+      msg[i] |= t << j; /* coefficient 8*i+j lands in bit j of msg[i] */
+    }
+  }
+}
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_compress MLKEM_NAMESPACE_K(empty_cu_compress)
+int empty_cu_compress;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/compress.h
new file mode 100644
index 000000000..409dbe519
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/compress.h
@@ -0,0 +1,495 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef COMPRESS_H
+#define COMPRESS_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "cbmc.h"
+#include "common.h"
+#include "debug.h"
+#include "poly.h"
+#include "verify.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1)
+#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4)
+#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5)
+#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10)
+#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11)
+#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4)
+#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5)
+#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10)
+#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11)
+/* End of static namespacing */
+
+/************************************************************
+ * Name: scalar_compress_d1
+ *
+ * Description: Computes round(u * 2 / q)
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 1.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d1(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 2)
+  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
+{
+  uint32_t d0 = u << 1;
+  d0 *= 645083;
+  d0 += 1u << 30;
+  d0 >>= 31;
+  return d0;
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_compress_d4
+ *
+ * Description: Computes round(u * 16 / q) % 16
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d4(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 16)
+  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
+{
+  uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */
+  return (d0 + (1u << 27)) >> 28;      /* round(d0/2^28) */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d4
+ *
+ * Description: Computes round(u * q / 16)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 4.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 16
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d4(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 16)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 8) / 16; }
+
+/************************************************************
+ * Name: scalar_compress_d5
+ *
+ * Description: Computes round(u * 32 / q) % 32
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine will exceed UINT32_MAX
+ * and wrap around for large values of u. This is expected and required.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d5(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < 32)
+  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
+{
+  uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */
+  return (d0 + (1u << 26)) >> 27;      /* round(d0/2^27) */
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d5
+ *
+ * Description: Computes round(u * q / 32)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 5.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 32
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d5(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 32)
+  ensures(return_value <= MLKEM_Q - 1)
+) { return ((u * MLKEM_Q) + 16) / 32; }
+
+/************************************************************
+ * Name: scalar_compress_d10
+ *
+ * Description: Computes round(u * 2**10 / q) % 2**10
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine is performed in 64-bit arithmetic:
+ * a 16-bit u times a constant below 2^32 cannot wrap around.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d10(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 10))
+  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
+{
+  uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^33 / MLKEM_Q) */
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33; /* round(d0/2^33) */
+  return (d0 & 0x3FF);
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d10
+ *
+ * Description: Computes round(u * q / 1024)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 10.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 1024
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d10(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 1024)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 512) / 1024; }
+
+/************************************************************
+ * Name: scalar_compress_d11
+ *
+ * Description: Computes round(u * 2**11 / q) % 2**11
+ *
+ *              Implements Compress_d from FIPS203, Eq (4.7),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo q
+ *                 to be compressed.
+ ************************************************************/
+/*
+ * The multiplication in this routine is performed in 64-bit arithmetic:
+ * a 16-bit u times a constant below 2^33 cannot wrap around.
+ */
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "unsigned-overflow"
+#endif
+static INLINE uint32_t scalar_compress_d11(uint16_t u)
+__contract__(
+  requires(u <= MLKEM_Q - 1)
+  ensures(return_value < (1u << 11))
+  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
+{
+  uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */
+  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
+  return (d0 & 0x7FF);
+}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+/************************************************************
+ * Name: scalar_decompress_d11
+ *
+ * Description: Computes round(u * q / 2048)
+ *
+ *              Implements Decompress_d from FIPS203, Eq (4.8),
+ *              for d = 11.
+ *
+ * Arguments: - u: Unsigned canonical modulus modulo 2048
+ *                 to be decompressed.
+ ************************************************************/
+static INLINE uint16_t scalar_decompress_d11(uint32_t u)
+__contract__(
+  requires(0 <= u && u < 2048)
+  ensures(return_value <= (MLKEM_Q - 1))
+) { return ((u * MLKEM_Q) + 1024) / 2048; }
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
+    (MLKEM_K == 2 || MLKEM_K == 3)
+#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
+/*************************************************
+ * Name:        poly_compress_d4
+ *
+ * Description: Compression (4 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
+
+#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
+/*************************************************
+ * Name:        poly_compress_d10
+ *
+ * Description: Compression (10 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
+
+#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
+/*************************************************
+ * Name:        poly_decompress_d4
+ *
+ * Description: De-serialization and subsequent decompression (4 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d4
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
+
+#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
+/*************************************************
+ * Name:        poly_decompress_d10
+ *
+ * Description: De-serialization and subsequent decompression (10 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d10
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d10(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
+          || MLKEM_K == 3) */
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
+#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
+/*************************************************
+ * Name:        poly_compress_d5
+ *
+ * Description: Compression (5 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
+
+#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
+/*************************************************
+ * Name:        poly_compress_d11
+ *
+ * Description: Compression (11 bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
+
+#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
+/*************************************************
+ * Name:        poly_decompress_d5
+ *
+ * Description: De-serialization and subsequent decompression (5 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d5
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+
+#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
+/*************************************************
+ * Name:        poly_decompress_d11
+ *
+ * Description: De-serialization and subsequent decompression (11 bits) of a
+ *              polynomial; approximate inverse of poly_compress_d11
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_decompress_d11(poly *r,
+                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
+        */
+
+#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
+/*************************************************
+ * Name:        poly_tobytes
+ *
+ * Description: Serialization of a polynomial.
+ *              Signed coefficients are converted to
+ *              unsigned form before serialization.
+ *
+ * Arguments:   INPUT:
+ *              - a: const pointer to input polynomial,
+ *                with each coefficient in the range [0,1,..,Q-1]
+ *              OUTPUT
+ *              - r: pointer to output byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYBYTES))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r))
+);
+
+
+#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes)
+/*************************************************
+ * Name:        poly_frombytes
+ *
+ * Description: De-serialization of a polynomial.
+ *
+ * Arguments:   INPUT
+ *              - a: pointer to input byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ *              OUTPUT
+ *              - r: pointer to output polynomial, with
+ *                   each coefficient unsigned and in the range
+ *                   0 .. 4095
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))
+);
+
+
+#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg)
+/*************************************************
+ * Name:        poly_frommsg
+ *
+ * Description: Convert 32-byte message to polynomial
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *msg: pointer to input message
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+);
+
+#define poly_tomsg MLKEM_NAMESPACE(poly_tomsg)
+/*************************************************
+ * Name:        poly_tomsg
+ *
+ * Description: Convert polynomial to 32-byte message
+ *
+ * Arguments:   - uint8_t *msg: pointer to output message
+ *              - const poly *r: pointer to input polynomial
+ *                Coefficients must be unsigned canonical
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r)
+__contract__(
+  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(msg))
+);
+
+#endif /* COMPRESS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/config.h
index fa89370ce..e975ede95 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/config.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/config.h
@@ -122,46 +122,87 @@
 /* #define MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 /******************************************************************************
- * Name:        MLKEM_USE_NATIVE
+ * Name:        MLKEM_USE_NATIVE_BACKEND_ARITH
  *
- * Description: Determines whether a native backend should
- *              be used, if available.
+ * Description: Determines whether a native arithmetic backend should be used.
+ *
+ *              The arithmetic backend covers performance critical functions
+ *              such as the number-theoretic transform (NTT).
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the arithmetic backend to be used is
+ *              determined by MLKEM_NATIVE_ARITH_BACKEND_FILE: If the latter is
+ *              unset, the default backend for the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
  *
  *              This can also be set using CFLAGS.
  *
  *****************************************************************************/
-#if !defined(MLKEM_USE_NATIVE)
-/* #define MLKEM_USE_NATIVE */
+#if !defined(MLKEM_USE_NATIVE_BACKEND_ARITH)
+/* #define MLKEM_USE_NATIVE_BACKEND_ARITH */
 #endif
 
 /******************************************************************************
- * Name:        MLKEM_NATIVE_ARITH_BACKEND
+ * Name:        MLKEM_NATIVE_ARITH_BACKEND_FILE
  *
  * Description: The arithmetic backend to use.
  *
- *              This must be the filename of an arithmetic backend.
- *              See the existing backends for examples.
+ *              If MLKEM_USE_NATIVE_BACKEND_ARITH is unset, this option
+ *              is ignored.
+ *
+ *              If MLKEM_USE_NATIVE_BACKEND_ARITH is set, this option must
+ *              either be undefined or the filename of an arithmetic backend.
+ *              If unset, the default backend will be used.
  *
  *              This can be set using CFLAGS.
  *
  *****************************************************************************/
-#if defined(MLKEM_USE_NATIVE) && !defined(MLKEM_NATIVE_ARITH_BACKEND)
-#define MLKEM_NATIVE_ARITH_BACKEND "default.h"
-#endif /* MLKEM_NATIVE_ARITH_BACKEND */
+#if defined(MLKEM_USE_NATIVE_BACKEND_ARITH) && \
+    !defined(MLKEM_NATIVE_ARITH_BACKEND_FILE)
+#define MLKEM_NATIVE_ARITH_BACKEND_FILE "native/default.h"
+#endif
 
 /******************************************************************************
- * Name:        MLKEM_NATIVE_FIPS202_BACKEND
+ * Name:        MLKEM_USE_NATIVE_BACKEND_FIPS202
+ *
+ * Description: Determines whether a native FIPS202 backend should be used.
+ *
+ *              The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is
+ *              the performance bottleneck of SHA3 and SHAKE.
+ *
+ *              If this option is unset, the C backend will be used.
+ *
+ *              If this option is set, the FIPS202 backend to be used is
+ *              determined by MLKEM_NATIVE_FIPS202_BACKEND_FILE: If the latter
+ *              is unset, the default backend for the target architecture
+ *              will be used. If set, it must be the name of a backend metadata
+ *              file.
+ *
+ *              This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if !defined(MLKEM_USE_NATIVE_BACKEND_FIPS202)
+/* #define MLKEM_USE_NATIVE_BACKEND_FIPS202 */
+#endif
+
+/******************************************************************************
+ * Name:        MLKEM_NATIVE_FIPS202_BACKEND_FILE
  *
  * Description: The FIPS-202 backend to use.
  *
- *              This must be the filename of an FIPS-202 backend.
+ *              If MLKEM_USE_NATIVE_BACKEND_FIPS202 is set, this option must
+ *              either be undefined or the filename of a FIPS202 backend.
+ *              If unset, the default backend will be used.
  *
  *              This can be set using CFLAGS.
  *
  *****************************************************************************/
-#if defined(MLKEM_USE_NATIVE_FIPS202) && !defined(MLKEM_NATIVE_FIPS202_BACKEND)
-#define MLKEM_NATIVE_FIPS202_BACKEND "native/default.h"
-#endif /* MLKEM_NATIVE_FIPS202_BACKEND */
+#if defined(MLKEM_USE_NATIVE_BACKEND_FIPS202) && \
+    !defined(MLKEM_NATIVE_FIPS202_BACKEND_FILE)
+#define MLKEM_NATIVE_FIPS202_BACKEND_FILE "fips202/native/default.h"
+#endif
 
 /*************************  Config internals  ********************************/
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/default.h
deleted file mode 100644
index d1e41c52e..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/default.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H
-#define MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H
-
-/*
- * Default arithmetic backend
- */
-#include "sys.h"
-
-#ifdef SYS_AARCH64
-/*
- * For AArch64, we currently we have one clean and one opt profile.
- * We default to the opt profile.
- *
- * In the future, this may branch further depending on the microarchitecture.
- */
-#include "aarch64/opt.h"
-#endif /* SYS_AARCH64 */
-
-#ifdef SYS_X86_64_AVX2
-/*
- * For now, there's only one x86_64 profile, based on
- * the AVX2 code from the Kyber repository.
- * https://github.com/pq-crystals/kyber
- */
-#include "x86_64/default.h"
-#endif /* SYS_X86_64 */
-
-#endif /* MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.c
index 0cfcc3e9e..318d0fc77 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.c
@@ -9,11 +9,10 @@
 #include "fips202.h"
 #include "fips202x4.h"
 #include "indcpa.h"
-#include "ntt.h"
 #include "poly.h"
-#include "polyvec.h"
+#include "poly_k.h"
 #include "randombytes.h"
-#include "rej_uniform.h"
+#include "sampling.h"
 #include "symmetric.h"
 
 #include "arith_backend.h"
@@ -149,14 +148,14 @@ static void unpack_ciphertext(polyvec *b, poly *v,
 #define poly_permute_bitrev_to_custom \
   MLKEM_NAMESPACE_K(poly_permute_bitrev_to_custom)
 
-static INLINE void poly_permute_bitrev_to_custom(poly *data)
+static INLINE void poly_permute_bitrev_to_custom(int16_t data[MLKEM_N])
 __contract__(
   /* We don't specify that this should be a permutation, but only
    * that it does not change the bound established at the end of gen_matrix. */
-  requires(memory_no_alias(data, sizeof(poly)))
-  requires(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N))
+  requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))
   assigns(memory_slice(data, sizeof(poly)))
-  ensures(array_bound(data->coeffs, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
+  ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
 #endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
 
 /* Not static for benchmarking */
@@ -245,7 +244,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
   {
     for (j = 0; j < MLKEM_K; j++)
     {
-      poly_permute_bitrev_to_custom(&a[i].vec[j]);
+      poly_permute_bitrev_to_custom(a[i].vec[j].coeffs);
     }
   }
 }
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.h
index 2c4fda3c4..b4d5985bf 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/indcpa.h
@@ -8,7 +8,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "polyvec.h"
+#include "poly_k.h"
 
 #define gen_matrix MLKEM_NAMESPACE_K(gen_matrix)
 /*************************************************
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/api.h
new file mode 100644
index 000000000..0704f9dcd
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/api.h
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * Native arithmetic interface
+ *
+ * This header is primarily for documentation purposes.
+ * It should not be included by backend implementations.
+ *
+ * To ensure consistency with backends, the header will be
+ * included automatically after inclusion of the active
+ * backend, to ensure consistency of function signatures,
+ * and run sanity checks.
+ */
+#ifdef MLKEM_NATIVE_ARITH_NATIVE_API_H
+#error \
+    "The arithmetic backend API `mlkem/native/api.h` "		\
+    "should not be directly included. Please include the relevant "	\
+    "structure headers directly."
+#else /* MLKEM_NATIVE_ARITH_NATIVE_API_H */
+#define MLKEM_NATIVE_ARITH_NATIVE_API_H
+
+#include <stdint.h>
+#include "../common.h"
+
+/*
+ * This is the C<->native interface allowing for the drop-in of
+ * native code for performance critical arithmetic components of ML-KEM.
+ *
+ * A _backend_ is a specific implementation of (part of) this interface.
+ *
+ * To add a function to a backend, define MLKEM_USE_NATIVE_XXX and
+ * implement `static inline xxx(...)` in the profile header.
+ *
+ * The only exception is MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER. This option can
+ * be set if there are native implementations for all of NTT, invNTT, and
+ * base multiplication, and allows the native implementation to use a
+ * custom order of polynomial coefficients in NTT domain -- the use of such
+ * custom order is not an implementation-detail since the public matrix
+ * is generated in NTT domain. In this case, a permutation function
+ * poly_permute_bitrev_to_custom() needs to be provided that permutes
+ * polynomials in NTT domain from bitreversed to the custom order.
+ */
+
+/*
+ * Those functions are meant to be trivial wrappers around the chosen native
+ * implementation. They are static inline to avoid unnecessary calls.
+ * The macro before each declaration controls whether a native
+ * implementation is present.
+ */
+
+#if defined(MLKEM_USE_NATIVE_NTT)
+/*************************************************
+ * Name:        ntt_native
+ *
+ * Description: Computes negacyclic number-theoretic transform (NTT) of
+ *              a polynomial in place.
+ *
+ *              The input polynomial is assumed to be in normal order.
+ *              The output polynomial is in bitreversed order, or of a
+ *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *              for more information.
+ *
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
+ **************************************************/
+static INLINE void ntt_native(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_NTT */
+
+#if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
+/*
+ * This must only be set if NTT, invNTT, basemul, mulcache, and
+ * to/from byte stream conversions all have native implementations
+ * that are adapted to the custom order.
+ */
+#if !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT) || \
+    !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) ||                  \
+    !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) ||  \
+    !defined(MLKEM_USE_NATIVE_POLY_TOBYTES) ||                           \
+    !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+#error \
+    "Invalid native profile: MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER can only be \
+set if there are native implementations for NTT, invNTT, mulcache, basemul, \
+and to/from bytes conversions."
+#endif
+
+/*************************************************
+ * Name:        poly_permute_bitrev_to_custom
+ *
+ * Description: When MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is defined,
+ *              convert a polynomial in NTT domain from bitreversed
+ *              order to the custom order output by the native NTT.
+ *
+ *              This must only be defined if there is native code for all
+ *              of NTT, invNTT, basemul, mulcache, and to/from bytes.
+ *
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
+ **************************************************/
+static INLINE void poly_permute_bitrev_to_custom(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER */
+
+#if defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        intt_native
+ *
+ * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+ *              of a polynomial in place.
+ *
+ *              The input polynomial is in bitreversed order, or of a
+ *              custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *              See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *              for more information.
+ *              The output polynomial is assumed to be in normal order.
+ *
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to in/output polynomial
+ **************************************************/
+static INLINE void intt_native(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_INTT */
+
+#if defined(MLKEM_USE_NATIVE_POLY_REDUCE)
+/*************************************************
+ * Name:        poly_reduce_native
+ *
+ * Description: Applies modular reduction to all coefficients of a polynomial.
+ *
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to input/output polynomial
+ **************************************************/
+static INLINE void poly_reduce_native(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
+
+#if defined(MLKEM_USE_NATIVE_POLY_TOMONT)
+/*************************************************
+ * Name:        poly_tomont_native
+ *
+ * Description: Inplace conversion of all coefficients of a polynomial
+ *              from normal domain to Montgomery domain
+ *
+ * Arguments:   - int16_t p[MLKEM_N]: pointer to input/output polynomial
+ **************************************************/
+static INLINE void poly_tomont_native(int16_t p[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
+
+#if defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE)
+/*************************************************
+ * Name:        poly_mulcache_compute_native
+ *
+ * Description: Compute multiplication cache for a polynomial
+ *              in NTT domain.
+ *
+ *              The purpose of the multiplication cache is to
+ *              cache repeated computations required during a
+ *              base multiplication of polynomials in NTT domain.
+ *              The structure of the multiplication-cache is
+ *              implementation defined.
+ *
+ * Arguments:   INPUT:
+ *              - poly: const pointer to input polynomial.
+ *                  This must be in NTT domain and in bitreversed order, or of
+ *                  a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *                  See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *                  for more information.
+ *              OUTPUT
+ *              - cache: pointer to multiplication cache
+ **************************************************/
+static INLINE void poly_mulcache_compute_native(int16_t cache[MLKEM_N / 2],
+                                                const int16_t poly[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
+#if defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery_cached_native
+ *
+ * Description: Compute multiplication of polynomials in NTT domain.
+ *
+ * Arguments:   INPUT:
+ *              - a: First polynomial operand.
+ *                 This must be in NTT domain and in bitreversed order, or of
+ *                 a custom order if MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER is set.
+ *                 See the documentation of MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
+ *                 for more information.
+ *              - b: Second polynomial operand.
+ *                 As for a.
+ *              - b_cache: Multiplication-cache for b.
+ *              OUTPUT
+ *              - r: Result of the base multiplication. This is again
+ *                   in NTT domain, and of the same order as a and b.
+ **************************************************/
+static INLINE void polyvec_basemul_acc_montgomery_cached_native(
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]);
+#endif
+
+#if defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
+/*************************************************
+ * Name:        poly_tobytes_native
+ *
+ * Description: Serialization of a polynomial.
+ *              Signed coefficients are converted to
+ *              unsigned form before serialization.
+ *
+ * Arguments:   INPUT:
+ *              - a: const pointer to input polynomial,
+ *                with each coefficient in the range -Q+1 .. Q-1
+ *              OUTPUT
+ *              - r: pointer to output byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ **************************************************/
+static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
+                                       const int16_t a[MLKEM_N]);
+#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+
+#if defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
+/*************************************************
+ * Name:        poly_frombytes_native
+ *
+ * Description: De-serialization of a polynomial
+ *              from a byte array; the counterpart
+ *              of poly_tobytes_native.
+ *
+ * Arguments:   INPUT:
+ *              - r: const pointer to input byte array
+ *                   (of MLKEM_POLYBYTES bytes)
+ *              OUTPUT
+ *              - a: pointer to output polynomial in NTT domain
+ **************************************************/
+static INLINE void poly_frombytes_native(int16_t a[MLKEM_N],
+                                         const uint8_t r[MLKEM_POLYBYTES]);
+#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
+
+#if defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
+/*************************************************
+ * Name:        rej_uniform_native
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int len:    requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes.
+ *
+ * Return -1 if the native implementation does not support the input lengths.
+ * Otherwise, returns non-negative number of sampled 16-bit integers (at most
+ * len).
+ **************************************************/
+static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
+                                     const uint8_t *buf, unsigned int buflen);
+#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+#endif /* MLKEM_NATIVE_ARITH_NATIVE_API_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/default.h
new file mode 100644
index 000000000..f9fe4310a
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/default.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H
+#define MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H
+
+/*
+ * Default arithmetic backend
+ */
+#include "../sys.h"
+
+#ifdef SYS_AARCH64
+/*
+ * For AArch64, we currently have one clean and one opt profile.
+ * We default to the opt profile.
+ *
+ * In the future, this may branch further depending on the microarchitecture.
+ */
+#include "aarch64/opt.h"
+#endif /* SYS_AARCH64 */
+
+#ifdef SYS_X86_64_AVX2
+/*
+ * For now, there's only one x86_64 profile, based on
+ * the AVX2 code from the Kyber repository.
+ * https://github.com/pq-crystals/kyber
+ */
+#include "x86_64/default.h"
+#endif /* SYS_X86_64_AVX2 */
+
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_DEFAULT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/README.md b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/README.md
similarity index 100%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/README.md
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/README.md
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/default.h
similarity index 90%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/default.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/default.h
index 592e8996d..73f53dc13 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/default.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/default.h
@@ -19,6 +19,6 @@
 /* Filename of the C backend implementation.
  * This is not inlined here because this header is included in assembly
  * files as well. */
-#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "x86_64/src/default_impl.h"
+#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/x86_64/src/default_impl.h"
 
 #endif /* MLKEM_NATIVE_ARITH_PROFILE_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/align.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/align.h
similarity index 100%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/align.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/align.h
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/arith_native_x86_64.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/arith_native_x86_64.h
similarity index 91%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/arith_native_x86_64.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/arith_native_x86_64.h
index 25e00a930..acde977ad 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/arith_native_x86_64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/arith_native_x86_64.h
@@ -5,11 +5,10 @@
 #ifndef MLKEM_X86_64_NATIVE_H
 #define MLKEM_X86_64_NATIVE_H
 
-#include "common.h"
+#include "../../../common.h"
 
 #include <immintrin.h>
 #include <stdint.h>
-#include "polyvec.h"
 #include "consts.h"
 
 #define REJ_UNIFORM_AVX_NBLOCKS 3 /* See MLKEM_GEN_MATRIX_NBLOCKS */
@@ -44,8 +43,9 @@ void basemul_avx2(__m256i *r, const __m256i *a, const __m256i *b,
 #define polyvec_basemul_acc_montgomery_cached_avx2 \
   MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_avx2)
 void polyvec_basemul_acc_montgomery_cached_avx2(
-    poly *r, const polyvec *a, const polyvec *b,
-    const polyvec_mulcache *b_cache);
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]);
 
 #define ntttobytes_avx2 MLKEM_NAMESPACE(ntttobytes_avx2)
 void ntttobytes_avx2(uint8_t *r, const __m256i *a, const __m256i *qdata);
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.S
index b97840e70..5fdc3d0a0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.S
@@ -6,7 +6,7 @@
 // Implementation from Kyber reference repository
 // https://github.com/pq-crystals/kyber/blob/main/avx2
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
 #include "consts.h"
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.c
similarity index 51%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.c
index 5f9ae99c8..970938306 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.c
@@ -3,46 +3,46 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
-#include "poly.h"
-#include "polyvec.h"
-
 #include "arith_native_x86_64.h"
 #include "consts.h"
 
-static void poly_basemul_montgomery_avx2(poly *r, const poly *a, const poly *b)
+static void poly_basemul_montgomery_avx2(int16_t r[MLKEM_N],
+                                         const int16_t a[MLKEM_N],
+                                         const int16_t b[MLKEM_N])
 {
-  basemul_avx2((__m256i *)r->coeffs, (const __m256i *)a->coeffs,
-               (const __m256i *)b->coeffs, qdata.vec);
+  basemul_avx2((__m256i *)r, (const __m256i *)a, (const __m256i *)b, qdata.vec);
 }
 
 /*
  * Implementation from Kyber reference repository
  * https://github.com/pq-crystals/kyber/blob/main/avx2
  */
-static void poly_add_avx2(poly *r, const poly *a, const poly *b)
+static void poly_add_avx2(int16_t r[MLKEM_N], const int16_t a[MLKEM_N],
+                          const int16_t b[MLKEM_N])
 {
   unsigned i;
   __m256i f0, f1;
 
   for (i = 0; i < MLKEM_N; i += 16)
   {
-    f0 = _mm256_load_si256((const __m256i *)&a->coeffs[i]);
-    f1 = _mm256_load_si256((const __m256i *)&b->coeffs[i]);
+    f0 = _mm256_load_si256((const __m256i *)&a[i]);
+    f1 = _mm256_load_si256((const __m256i *)&b[i]);
     f0 = _mm256_add_epi16(f0, f1);
-    _mm256_store_si256((__m256i *)&r->coeffs[i], f0);
+    _mm256_store_si256((__m256i *)&r[i], f0);
   }
 }
 
-void polyvec_basemul_acc_montgomery_cached_avx2(poly *r, const polyvec *a,
-                                                const polyvec *b,
-                                                const polyvec_mulcache *b_cache)
+void polyvec_basemul_acc_montgomery_cached_avx2(
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)])
 {
   unsigned i;
-  poly t;
+  int16_t t[MLKEM_N] ALIGN;
 
   /* TODO: Use mulcache for AVX2. So far, it is unused. */
   ((void)b_cache);
@@ -50,11 +50,11 @@ void polyvec_basemul_acc_montgomery_cached_avx2(poly *r, const polyvec *a,
   /* Coefficient-wise bound of each basemul is 2q.
    * Since we are accumulating at most 4 times, the
    * overall bound is 8q < INT16_MAX. */
-  poly_basemul_montgomery_avx2(r, &a->vec[0], &b->vec[0]);
+  poly_basemul_montgomery_avx2(r, &a[0], &b[0]);
   for (i = 1; i < MLKEM_K; i++)
   {
-    poly_basemul_montgomery_avx2(&t, &a->vec[i], &b->vec[i]);
-    poly_add_avx2(r, r, &t);
+    poly_basemul_montgomery_avx2(t, &a[i * MLKEM_N], &b[i * MLKEM_N]);
+    poly_add_avx2(r, r, t);
   }
 }
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/consts.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/consts.c
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/consts.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/consts.c
index 86a0835ef..568752ae8 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/consts.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/consts.c
@@ -8,7 +8,7 @@
  * https://github.com/pq-crystals/kyber/blob/main/avx2/consts.c
  */
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/consts.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/consts.h
similarity index 97%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/consts.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/consts.h
index 00c415952..e2846b609 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/consts.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/consts.h
@@ -11,7 +11,7 @@
 #ifndef CONSTS_H
 #define CONSTS_H
 
-#include "common.h"
+#include "../../../common.h"
 
 #define AVX2_BACKEND_DATA_OFFSET_16XQ 0
 #define AVX2_BACKEND_DATA_OFFSET_16XQINV 16
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/default_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/default_impl.h
similarity index 62%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/default_impl.h
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/default_impl.h
index 029111c17..3683361e2 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/default_impl.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/default_impl.h
@@ -12,8 +12,7 @@
 
 #include <string.h>
 
-#include "poly.h"
-#include "polyvec.h"
+#include "../../../params.h"
 #include "arith_native_x86_64.h"
 
 #define MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER
@@ -28,9 +27,9 @@
 #define MLKEM_USE_NATIVE_POLY_TOBYTES
 #define MLKEM_USE_NATIVE_POLY_FROMBYTES
 
-static INLINE void poly_permute_bitrev_to_custom(poly *data)
+static INLINE void poly_permute_bitrev_to_custom(int16_t data[MLKEM_N])
 {
-  nttunpack_avx2((__m256i *)(data->coeffs), qdata.vec);
+  nttunpack_avx2((__m256i *)(data), qdata.vec);
 }
 
 static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
@@ -45,27 +44,28 @@ static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
   return (int)rej_uniform_avx2(r, buf);
 }
 
-static INLINE void ntt_native(poly *data)
+static INLINE void ntt_native(int16_t data[MLKEM_N])
 {
   ntt_avx2((__m256i *)data, qdata.vec);
 }
 
-static INLINE void intt_native(poly *data)
+static INLINE void intt_native(int16_t data[MLKEM_N])
 {
   invntt_avx2((__m256i *)data, qdata.vec);
 }
 
-static INLINE void poly_reduce_native(poly *data)
+static INLINE void poly_reduce_native(int16_t data[MLKEM_N])
 {
-  reduce_avx2((__m256i *)data->coeffs, qdata.vec);
+  reduce_avx2((__m256i *)data, qdata.vec);
 }
 
-static INLINE void poly_tomont_native(poly *data)
+static INLINE void poly_tomont_native(int16_t data[MLKEM_N])
 {
-  tomont_avx2((__m256i *)data->coeffs, qdata.vec);
+  tomont_avx2((__m256i *)data, qdata.vec);
 }
 
-static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const poly *y)
+static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2],
+                                                const int16_t y[MLKEM_N])
 {
   /* AVX2 backend does not use mulcache */
   ((void)y);
@@ -73,22 +73,23 @@ static INLINE void poly_mulcache_compute_native(poly_mulcache *x, const poly *y)
 }
 
 static INLINE void polyvec_basemul_acc_montgomery_cached_native(
-    poly *r, const polyvec *a, const polyvec *b,
-    const polyvec_mulcache *b_cache)
+    int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N],
+    const int16_t b[MLKEM_K * MLKEM_N],
+    const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)])
 {
   polyvec_basemul_acc_montgomery_cached_avx2(r, a, b, b_cache);
 }
 
 static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
-                                       const poly *a)
+                                       const int16_t a[MLKEM_N])
 {
-  ntttobytes_avx2(r, (const __m256i *)a->coeffs, qdata.vec);
+  ntttobytes_avx2(r, (const __m256i *)a, qdata.vec);
 }
 
-static INLINE void poly_frombytes_native(poly *r,
+static INLINE void poly_frombytes_native(int16_t r[MLKEM_N],
                                          const uint8_t a[MLKEM_POLYBYTES])
 {
-  nttfrombytes_avx2((__m256i *)r->coeffs, a, qdata.vec);
+  nttfrombytes_avx2((__m256i *)r, a, qdata.vec);
 }
 
 #endif /* MLKEM_NATIVE_ARITH_PROFILE_IMPL_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/fq.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/fq.S
similarity index 98%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/fq.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/fq.S
index 134bd4f71..3f013a5fa 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/fq.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/fq.S
@@ -11,7 +11,7 @@
 //   in [0,1,...,q-1] rather than [0,1,...,q], matching the
 //   semantics of poly_reduce().
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 #include "consts.h"
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/fq.inc b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/fq.inc
similarity index 100%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/fq.inc
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/fq.inc
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/intt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/intt.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/intt.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/intt.S
index 6b1d78ef2..7b1f22624 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/intt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/intt.S
@@ -9,7 +9,7 @@
  * Changes to placement of modular reductions have
  * been made to simplify reasoning of non-overflow */
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/ntt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntt.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/ntt.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntt.S
index e8bf7894b..5d928b4cc 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/ntt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntt.S
@@ -6,7 +6,7 @@
 // Implementation from Kyber reference repository
 // https://github.com/pq-crystals/kyber/blob/main/avx2
 
-#include "common.h"
+#include "../../../common.h"
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
 #include "consts.h"
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/rej_uniform_avx2.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_avx2.c
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/rej_uniform_avx2.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_avx2.c
index 54037a0df..adf2d338b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/rej_uniform_avx2.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_avx2.c
@@ -8,7 +8,7 @@
  * https://github.com/pq-crystals/kyber/blob/main/avx2
  */
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/rej_uniform_table.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_table.c
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/rej_uniform_table.c
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_table.c
index 9bbc47146..e95fd9e79 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/rej_uniform_table.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_table.c
@@ -8,7 +8,7 @@
  *          Do not modify it directly.
  */
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/shuffle.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/shuffle.S
similarity index 99%
rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/shuffle.S
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/shuffle.S
index 5e708748a..9bcd04896 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/shuffle.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/shuffle.S
@@ -6,7 +6,7 @@
 // Implementation from Kyber reference repository
 // https://github.com/pq-crystals/kyber/blob/main/avx2
 
-#include "common.h"
+#include "../../../common.h"
 
 #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT)
 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/shuffle.inc b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/shuffle.inc
similarity index 100%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/shuffle.inc
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/shuffle.inc
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/x86_64_zetas.i b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/x86_64_zetas.i
similarity index 100%
rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/x86_64_zetas.i
rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/x86_64_zetas.i
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/ntt.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/ntt.c
deleted file mode 100644
index 3651c8da9..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/ntt.c
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-
-#include <stdint.h>
-#include "arith_backend.h"
-#include "debug.h"
-#include "ntt.h"
-#include "reduce.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block)
-#define ntt_layer MLKEM_NAMESPACE(ntt_layer)
-#define invntt_layer MLKEM_NAMESPACE(invntt_layer)
-/* End of static namespacing */
-
-#if !defined(MLKEM_USE_NATIVE_NTT)
-/*
- * Computes a block CT butterflies with a fixed twiddle factor,
- * using Montgomery multiplication.
- * Parameters:
- * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
- * - root: Twiddle factor to use for the butterfly. This must be in
- *         Montgomery form and signed canonical.
- * - start: Offset to the beginning of the butterfly block
- * - len: Index difference between coefficients subject to a butterfly
- * - bound: Ghost variable describing coefficient bound: Prior to `start`,
- *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
- *          they must be bound by `bound`.
- * When this function returns, output coefficients in the index range
- * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
- * Example:
- * - start=8, len=4
- *   This would compute the following four butterflies
- *          8     --    12
- *             9    --     13
- *                10   --     14
- *                   11   --     15
- * - start=4, len=2
- *   This would compute the following two butterflies
- *          4 -- 6
- *             5 -- 7
- */
-static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
-                                unsigned start, unsigned len, int bound)
-__contract__(
-  requires(start < MLKEM_N)
-  requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
-  requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
-  requires(-HALF_Q < zeta && zeta < HALF_Q)
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(array_abs_bound(r, 0, start, bound + MLKEM_Q))
-  requires(array_abs_bound(r, start, MLKEM_N, bound))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q))
-  ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
-{
-  /* `bound` is a ghost variable only needed in the CBMC specification */
-  unsigned j;
-  ((void)bound);
-  for (j = start; j < start + len; j++)
-  __loop__(
-    invariant(start <= j && j <= start + len)
-    /*
-     * Coefficients are updated in strided pairs, so the bounds for the
-     * intermediate states alternate twice between the old and new bound
-     */
-    invariant(array_abs_bound(r, 0,           j,           bound + MLKEM_Q))
-    invariant(array_abs_bound(r, j,           start + len, bound))
-    invariant(array_abs_bound(r, start + len, j + len,     bound + MLKEM_Q))
-    invariant(array_abs_bound(r, j + len,     MLKEM_N,     bound)))
-  {
-    int16_t t;
-    t = fqmul(r[j + len], zeta);
-    r[j + len] = r[j] - t;
-    r[j] = r[j] + t;
-  }
-}
-
-/*
- *Compute one layer of forward NTT
- * Parameters:
- * - r: Pointer to base of polynomial
- * - len: Stride of butterflies in this layer.
- * - layer: Ghost variable indicating which layer is being applied.
- *          Must match `len` via `len == MLKEM_N >> layer`.
- * Note: `len` could be dropped and computed in the function, but
- *   we are following the structure of the reference NTT from the
- *   official Kyber implementation here, merely adding `layer` as
- *   a ghost variable for the specifications.
- */
-static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
-__contract__(
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
-  requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
-{
-  unsigned start, k;
-  /* `layer` is a ghost variable only needed in the CBMC specification */
-  ((void)layer);
-  /* Twiddle factors for layer n start at index 2^(layer-1) */
-  k = MLKEM_N / (2 * len);
-  for (start = 0; start < MLKEM_N; start += 2 * len)
-  __loop__(
-    invariant(start < MLKEM_N + 2 * len)
-    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
-    invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
-    invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
-  {
-    int16_t zeta = zetas[k++];
-    ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
-  }
-}
-
-/*
- * Compute full forward NTT
- * NOTE: This particular implementation satisfies a much tighter
- * bound on the output coefficients (5*q) than the contractual one (8*q),
- * but this is not needed in the calling code. Should we change the
- * base multiplication strategy to require smaller NTT output bounds,
- * the proof may need strengthening.
- */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *p)
-{
-  unsigned len, layer;
-  int16_t *r;
-  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
-  r = p->coeffs;
-
-  for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
-  __loop__(
-    invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer))
-    invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)))
-  {
-    ntt_layer(r, len, layer);
-  }
-
-  /* Check the stronger bound */
-  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
-}
-#else  /* MLKEM_USE_NATIVE_NTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *p)
-{
-  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
-  ntt_native(p);
-  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
-}
-#endif /* MLKEM_USE_NATIVE_NTT */
-
-#if !defined(MLKEM_USE_NATIVE_INTT)
-
-/* Compute one layer of inverse NTT */
-static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
-__contract__(
-  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-  requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
-  requires(len == (1 << (8 - layer)))
-  requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-  ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-{
-  unsigned start, k;
-  /* `layer` is a ghost variable used only in the specification */
-  ((void)layer);
-  k = MLKEM_N / len - 1;
-  for (start = 0; start < MLKEM_N; start += 2 * len)
-  __loop__(
-    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-    invariant(start <= MLKEM_N && k <= 127)
-    /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
-    invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
-  {
-    unsigned j;
-    int16_t zeta = zetas[k--];
-    for (j = start; j < start + len; j++)
-    __loop__(
-      invariant(start <= j && j <= start + len)
-      invariant(start <= MLKEM_N && k <= 127)
-      invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-    {
-      int16_t t = r[j];
-      r[j] = barrett_reduce(t + r[j + len]);
-      r[j + len] = r[j + len] - t;
-      r[j + len] = fqmul(r[j + len], zeta);
-    }
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *p)
-{
-  /*
-   * Scale input polynomial to account for Montgomery factor
-   * and NTT twist. This also brings coefficients down to
-   * absolute value < MLKEM_Q.
-   */
-  unsigned j, len, layer;
-  const int16_t f = 1441;
-  int16_t *r = p->coeffs;
-
-  for (j = 0; j < MLKEM_N; j++)
-  __loop__(
-    invariant(j <= MLKEM_N)
-    invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
-  {
-    r[j] = fqmul(r[j], f);
-  }
-
-  /* Run the invNTT layers */
-  for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
-  __loop__(
-    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
-    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
-  {
-    invntt_layer(p->coeffs, len, layer);
-  }
-
-  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
-}
-#else  /* MLKEM_USE_NATIVE_INTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *p)
-{
-  intt_native(p);
-  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
-}
-#endif /* MLKEM_USE_NATIVE_INTT */
-
-MLKEM_NATIVE_INTERNAL_API
-void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
-                    int16_t b_cached)
-{
-  int32_t t0, t1;
-  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
-
-  t0 = (int32_t)a[1] * b_cached;
-  t0 += (int32_t)a[0] * b[0];
-  t1 = (int32_t)a[0] * b[1];
-  t1 += (int32_t)a[1] * b[0];
-
-  /* |ti| < 2 * q * 2^15 */
-  r[0] = montgomery_reduce(t0);
-  r[1] = montgomery_reduce(t1);
-
-  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
-}
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_ntt MLKEM_NAMESPACE_K(empty_cu_ntt)
-int empty_cu_ntt;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/ntt.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/ntt.h
deleted file mode 100644
index 4e80d3ab3..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/ntt.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef NTT_H
-#define NTT_H
-#include "common.h"
-
-#include <stdint.h>
-#include "cbmc.h"
-#include "poly.h"
-#include "reduce.h"
-
-#define zetas MLKEM_NAMESPACE(zetas)
-extern const int16_t zetas[128];
-
-#define poly_ntt MLKEM_NAMESPACE(poly_ntt)
-/*************************************************
- * Name:        poly_ntt
- *
- * Description: Computes negacyclic number-theoretic transform (NTT) of
- *              a polynomial in place.
- *
- *              The input is assumed to be in normal order and
- *              coefficient-wise bound by MLKEM_Q in absolute value.
- *
- *              The output polynomial is in bitreversed order, and
- *              coefficient-wise bound by NTT_BOUND in absolute value.
- *
- *              (NOTE: Sometimes the input to the NTT is actually smaller,
- *               which gives better bounds.)
- *
- * Arguments:   - poly *p: pointer to in/output polynomial
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_ntt(poly *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND))
-);
-
-#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont)
-/*************************************************
- * Name:        poly_invntt_tomont
- *
- * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
- *              of a polynomial in place;
- *              inputs assumed to be in bitreversed order, output in normal
- *              order
- *
- *              The input is assumed to be in bitreversed order, and can
- *              have arbitrary coefficients in int16_t.
- *
- *              The output polynomial is in normal order, and
- *              coefficient-wise bound by INVNTT_BOUND in absolute value.
- *
- * Arguments:   - uint16_t *a: pointer to in/output polynomial
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_invntt_tomont(poly *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND))
-);
-
-#define basemul_cached MLKEM_NAMESPACE(basemul_cached)
-/************************************************************
- * Name: basemul_cached
- *
- * Description: Computes a representative modulo q of
- *              (a0*b0 + a1*b_cached, a0*b1 + a1*b0)/65536
- *
- *              If b_cached is b1*zeta, this represents the
- *              product of (a0 + a1*X) and (b0 + b1*X) in
- *              Fq[X]/(X^2 - zeta).
- *
- * Arguments: - r: Pointer to output polynomial
- *                   Upon return, coefficients are bound by
- *                   2*MLKEM_Q in absolute value.
- *            - a: Pointer to first input polynomial
- *                   Every coefficient must be in [0..4095]
- *            - b: Pointer to second input polynomial
- *                   Can have arbitrary int16_t coefficients
- *            - b_cached: Some precomputed value, typically derived from
- *                   b1 and a twiddle factor. Can be an arbitary int16_t.
- ************************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
-                    int16_t b_cached)
-__contract__(
-  requires(memory_no_alias(r, 2 * sizeof(int16_t)))
-  requires(memory_no_alias(a, 2 * sizeof(int16_t)))
-  requires(memory_no_alias(b, 2 * sizeof(int16_t)))
-  requires(array_bound(a, 0, 2, 0, UINT12_LIMIT))
-  assigns(memory_slice(r, 2 * sizeof(int16_t)))
-  ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q))
-);
-
-#endif /* NTT_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/params.h
index 57ea4c8ba..7f6c12625 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/params.h
@@ -18,6 +18,7 @@
 #define MLKEM_N 256
 #define MLKEM_Q 3329
 #define UINT12_LIMIT 4096
+#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */
 
 #define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */
 #define MLKEM_SSBYTES 32  /* size in bytes of shared key */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.c
index 7483ebf6d..e8a2e2c6e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.c
@@ -8,388 +8,246 @@
 #include <stdint.h>
 #include <string.h>
 #include "arith_backend.h"
-#include "cbd.h"
 #include "cbmc.h"
 #include "debug.h"
 #include "fips202x4.h"
-#include "ntt.h"
 #include "poly.h"
-#include "reduce.h"
+#include "sampling.h"
 #include "symmetric.h"
 #include "verify.h"
 
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a)
-{
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 16)))
-    {
-      t[j] = scalar_compress_d4(a->coeffs[8 * i + j]);
-    }
-
-    r[i * 4] = t[0] | (t[1] << 4);
-    r[i * 4 + 1] = t[2] | (t[3] << 4);
-    r[i * 4 + 2] = t[4] | (t[5] << 4);
-    r[i * 4 + 3] = t[6] | (t[7] << 4);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a)
-{
-  unsigned j;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-  for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(invariant(j <= MLKEM_N / 4))
-  {
-    unsigned k;
-    uint16_t t[4];
-    for (k = 0; k < 4; k++)
-    __loop__(
-      invariant(k <= 4)
-      invariant(forall(r, 0, k, t[r] < (1u << 10))))
-    {
-      t[k] = scalar_compress_d10(a->coeffs[4 * j + k]);
-    }
-
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 10-bit in size.
-     */
-    r[5 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
-    r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
-    r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
-    r[5 * j + 4] = (t[3] >> 2);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_Q)))
-  {
-    r->coeffs[2 * i + 0] = scalar_decompress_d4((a[i] >> 0) & 0xF);
-    r->coeffs[2 * i + 1] = scalar_decompress_d4((a[i] >> 4) & 0xF);
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d10(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16)
+#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic)
+#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce)
+#define fqmul MLKEM_NAMESPACE(fqmul)
+#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce)
+#define basemul_cached MLKEM_NAMESPACE(basemul_cached)
+#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q)
+#define ntt_butterfly_block MLKEM_NAMESPACE(ntt_butterfly_block)
+#define ntt_layer MLKEM_NAMESPACE(ntt_layer)
+#define invntt_layer MLKEM_NAMESPACE(invntt_layer)
+/* End of static namespacing */
+
+/*************************************************
+ * Name:        cast_uint16_to_int16
+ *
+ * Description: Cast uint16 value to int16
+ *
+ * Returns:
+ *   input x in     0 .. 32767: returns value unchanged
+ *   input x in 32768 .. 65535: returns (x - 65536)
+ **************************************************/
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "conversion"
+#endif
+ALWAYS_INLINE
+static INLINE int16_t cast_uint16_to_int16(uint16_t x)
 {
-  unsigned j;
-  for (j = 0; j < MLKEM_N / 4; j++)
-  __loop__(
-    invariant(j <= MLKEM_N / 4)
-    invariant(array_bound(r->coeffs, 0, 4 * j, 0, MLKEM_Q)))
-  {
-    unsigned k;
-    uint16_t t[4];
-    uint8_t const *base = &a[5 * j];
-
-    t[0] = 0x3FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x3FF & ((base[1] >> 2) | ((uint16_t)base[2] << 6));
-    t[2] = 0x3FF & ((base[2] >> 4) | ((uint16_t)base[3] << 4));
-    t[3] = 0x3FF & ((base[3] >> 6) | ((uint16_t)base[4] << 2));
-
-    for (k = 0; k < 4; k++)
-    __loop__(
-      invariant(k <= 4)
-      invariant(array_bound(r->coeffs, 0, 4 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[4 * j + k] = scalar_decompress_d10(t[k]);
-    }
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  /*
+   * PORTABILITY: This relies on uint16_t -> int16_t
+   * being implemented as the inverse of int16_t -> uint16_t,
+   * which is implementation-defined (C99 6.3.1.3 (3))
+   * CBMC (correctly) fails to prove this conversion is OK,
+   * so we have to suppress that check here
+   */
+  return (int16_t)x;
 }
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
-          || MLKEM_K == 3) */
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
 
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a)
+/*************************************************
+ * Name:        montgomery_reduce_generic
+ *
+ * Description: Generic Montgomery reduction; given a 32-bit integer a, computes
+ *              16-bit integer congruent to a * R^-1 mod q, where R=2^16
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q, with absolute value
+ *                <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2
+ *
+ **************************************************/
+ALWAYS_INLINE
+static INLINE int16_t montgomery_reduce_generic(int32_t a)
 {
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */
+  const uint32_t QINV = 62209; /* q^-1 mod 2^16 */
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    uint8_t t[8] = {0};
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(t, 0, j, 0, 32)))
-    {
-      t[j] = scalar_compress_d5(a->coeffs[8 * i + j]);
-    }
+  /*  Compute a*q^{-1} mod 2^16 in unsigned representatives */
+  const uint16_t a_reduced = a & UINT16_MAX;
+  const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
 
-    /*
-     * Explicitly truncate to avoid warning about
-     * implicit truncation in CBMC, and use array indexing into
-     * r rather than pointer-arithmetic to simplify verification
-     */
-    r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
-    r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
-    r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
-    r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
-    r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
-  }
-}
+  /* Lift to signed canonical representative mod 2^16. */
+  const int16_t t = cast_uint16_to_int16(a_inverted);
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a)
-{
-  unsigned j;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  int32_t r = a - ((int32_t)t * MLKEM_Q);
+  /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */
 
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(invariant(j <= MLKEM_N / 8))
-  {
-    unsigned k;
-    uint16_t t[8];
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(k <= 8)
-      invariant(forall(r, 0, k, t[r] < (1u << 11))))
-    {
-      t[k] = scalar_compress_d11(a->coeffs[8 * j + k]);
-    }
+  /*
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  r = r >> 16;
+  /* Bounds: |r >> 16| <= ceil(|r| / 2^16)
+   *                   <= ceil(|a| / 2^16 + MLKEM_Q / 2)
+   *                   <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
+   *
+   * (Note that |a >> 16| = ceil(|a| / 2^16) for negative a)
+   */
 
-    /*
-     * Make all implicit truncation explicit. No data is being
-     * truncated for the LHS's since each t[i] is 11-bit in size.
-     */
-    r[11 * j + 0] = (t[0] >> 0) & 0xFF;
-    r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
-    r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
-    r[11 * j + 3] = (t[2] >> 2) & 0xFF;
-    r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
-    r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
-    r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
-    r[11 * j + 7] = (t[5] >> 1) & 0xFF;
-    r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
-    r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
-    r[11 * j + 10] = (t[7] >> 3);
-  }
+  return (int16_t)r;
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+/*************************************************
+ * Name:        montgomery_reduce
+ *
+ * Description: Montgomery reduction
+ *
+ * Arguments:   - int32_t a: input integer to be reduced
+ *                  Must be smaller than 2 * 2^12 * 2^15 in absolute value.
+ *
+ * Returns:     integer congruent to a * R^-1 modulo q,
+ *              smaller than 2 * q in absolute value.
+ **************************************************/
+static INLINE int16_t montgomery_reduce(int32_t a)
+__contract__(
+  requires(a > -(2 * UINT12_LIMIT * 32768))
+  requires(a <  (2 * UINT12_LIMIT * 32768))
+  ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
+)
 {
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
-  {
-    unsigned j;
-    uint8_t t[8];
-    const unsigned offset = i * 5;
-    /*
-     * Explicitly truncate to avoid warning about
-     * implicit truncation in CBMC and unwind loop for ease
-     * of proof.
-     */
-
-    /*
-     * Decompress 5 8-bit bytes (so 40 bits) into
-     * 8 5-bit values stored in t[]
-     */
-    t[0] = 0x1F & (a[offset + 0] >> 0);
-    t[1] = 0x1F & ((a[offset + 0] >> 5) | (a[offset + 1] << 3));
-    t[2] = 0x1F & (a[offset + 1] >> 2);
-    t[3] = 0x1F & ((a[offset + 1] >> 7) | (a[offset + 2] << 1));
-    t[4] = 0x1F & ((a[offset + 2] >> 4) | (a[offset + 3] << 4));
-    t[5] = 0x1F & (a[offset + 3] >> 1);
-    t[6] = 0x1F & ((a[offset + 3] >> 6) | (a[offset + 4] << 2));
-    t[7] = 0x1F & (a[offset + 4] >> 3);
-
-    /* and copy to the correct slice in r[] */
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(j <= 8 && i <= MLKEM_N / 8)
-      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * i + j] = scalar_decompress_d5(t[j]);
-    }
-  }
-
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  int16_t res;
+  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
+
+  res = montgomery_reduce_generic(a);
+  /* Bounds:
+   * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2
+   *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
+   *        < 2 * MLKEM_Q */
+
+  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
+  return res;
 }
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d11(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+#if !defined(MLKEM_USE_NATIVE_POLY_TOMONT) ||           \
+    !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
+    !defined(MLKEM_USE_NATIVE_NTT) || !defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        fqmul
+ *
+ * Description: Montgomery multiplication modulo q=3329
+ *
+ * Arguments:   - int16_t a: first factor
+ *                  Can be any int16_t.
+ *              - int16_t b: second factor.
+ *                  Must be signed canonical (abs value <(q+1)/2)
+ *
+ * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and
+ * smaller than q in absolute value.
+ *
+ **************************************************/
+static INLINE int16_t fqmul(int16_t a, int16_t b)
+__contract__(
+  requires(b > -HALF_Q)
+  requires(b < HALF_Q)
+  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
+)
 {
-  unsigned j;
-  for (j = 0; j < MLKEM_N / 8; j++)
-  __loop__(
-    invariant(j <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * j, 0, MLKEM_Q)))
-  {
-    unsigned k;
-    uint16_t t[8];
-    uint8_t const *base = &a[11 * j];
-    t[0] = 0x7FF & ((base[0] >> 0) | ((uint16_t)base[1] << 8));
-    t[1] = 0x7FF & ((base[1] >> 3) | ((uint16_t)base[2] << 5));
-    t[2] = 0x7FF & ((base[2] >> 6) | ((uint16_t)base[3] << 2) |
-                    ((uint16_t)base[4] << 10));
-    t[3] = 0x7FF & ((base[4] >> 1) | ((uint16_t)base[5] << 7));
-    t[4] = 0x7FF & ((base[5] >> 4) | ((uint16_t)base[6] << 4));
-    t[5] = 0x7FF & ((base[6] >> 7) | ((uint16_t)base[7] << 1) |
-                    ((uint16_t)base[8] << 9));
-    t[6] = 0x7FF & ((base[8] >> 2) | ((uint16_t)base[9] << 6));
-    t[7] = 0x7FF & ((base[9] >> 5) | ((uint16_t)base[10] << 3));
-
-    for (k = 0; k < 8; k++)
-    __loop__(
-      invariant(k <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * j + k, 0, MLKEM_Q)))
-    {
-      r->coeffs[8 * j + k] = scalar_decompress_d11(t[k]);
-    }
-  }
+  int16_t res;
+  debug_assert_abs_bound(&b, 1, HALF_Q);
+
+  res = montgomery_reduce((int32_t)a * (int32_t)b);
+  /* Bounds:
+   * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
+   *       <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
+   *        < MLKEM_Q
+   */
 
-  debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+  debug_assert_abs_bound(&res, 1, MLKEM_Q);
+  return res;
 }
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD) || MLKEM_K == 4 */
-
-#if !defined(MLKEM_USE_NATIVE_POLY_TOBYTES)
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
+#endif /* !defined(MLKEM_USE_NATIVE_POLY_TOMONT) ||           \
+          !defined(MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
+          !defined(MLKEM_USE_NATIVE_NTT) ||                   \
+          !defined(MLKEM_USE_NATIVE_INTT) */
+
+#if !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || !defined(MLKEM_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        barrett_reduce
+ *
+ * Description: Barrett reduction; given a 16-bit integer a, computes
+ *              centered representative congruent to a mod q in
+ *              {-(q-1)/2,...,(q-1)/2}
+ *
+ * Arguments:   - int16_t a: input integer to be reduced
+ *
+ * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
+ **************************************************/
+static INLINE int16_t barrett_reduce(int16_t a)
+__contract__(
+  ensures(return_value > -HALF_Q && return_value < HALF_Q)
+)
 {
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(invariant(i <= MLKEM_N / 2))
-  {
-    const uint16_t t0 = a->coeffs[2 * i];
-    const uint16_t t1 = a->coeffs[2 * i + 1];
-    /*
-     * t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
-     * significant data, so these can be packed into 24 bits or exactly
-     * 3 bytes, as follows.
-     */
-
-    /* Least significant bits 0 - 7 of t0. */
-    r[3 * i + 0] = t0 & 0xFF;
-
-    /*
-     * Most significant bits 8 - 11 of t0 become the least significant
-     * nibble of the second byte. The least significant 4 bits
-     * of t1 become the upper nibble of the second byte.
-     */
-    r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+  /*
+   * To divide by MLKEM_Q using Barrett multiplication, the "magic number"
+   * multiplier is round_to_nearest(2**26/MLKEM_Q)
+   */
+  const int BPOWER = 26;
+  const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q;
 
-    /* Bits 4 - 11 of t1 become the third byte. */
-    r[3 * i + 2] = t1 >> 4;
-  }
-}
-#else  /* MLKEM_USE_NATIVE_POLY_TOBYTES */
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
-{
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
-  poly_tobytes_native(r, a);
-}
-#endif /* MLKEM_USE_NATIVE_POLY_TOBYTES */
+  /*
+   * Compute round_to_nearest(a/MLKEM_Q) using the multiplier
+   * above and shift by BPOWER places.
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER;
 
-#if !defined(MLKEM_USE_NATIVE_POLY_FROMBYTES)
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 2)
-    invariant(array_bound(r->coeffs, 0, 2 * i, 0, UINT12_LIMIT)))
-  {
-    const uint8_t t0 = a[3 * i + 0];
-    const uint8_t t1 = a[3 * i + 1];
-    const uint8_t t2 = a[3 * i + 2];
-    r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
-    r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
-  }
+  /*
+   * t is in -10 .. +10, so we need 32-bit math to
+   * evaluate t * MLKEM_Q and the subsequent subtraction
+   */
+  int16_t res = (int16_t)(a - t * MLKEM_Q);
 
-  /* Note that the coefficients are not canonical */
-  debug_assert_bound(r, MLKEM_N, 0, UINT12_LIMIT);
-}
-#else  /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-{
-  poly_frombytes_native(r, a);
+  debug_assert_abs_bound(&res, 1, HALF_Q);
+  return res;
 }
-#endif /* MLKEM_USE_NATIVE_POLY_FROMBYTES */
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
+#endif /* !defined(MLKEM_USE_NATIVE_POLY_REDUCE) || \
+          !defined(MLKEM_USE_NATIVE_INTT) */
+
+static void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2],
+                           int16_t b_cached)
+__contract__(
+  requires(memory_no_alias(r, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(a, 2 * sizeof(int16_t)))
+  requires(memory_no_alias(b, 2 * sizeof(int16_t)))
+  requires(array_bound(a, 0, 2, 0, UINT12_LIMIT))
+  assigns(memory_slice(r, 2 * sizeof(int16_t)))
+  ensures(array_abs_bound(r, 0, 2, 2 * MLKEM_Q)))
 {
-  unsigned i;
-#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
-#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
-#endif
+  int32_t t0, t1;
+  debug_assert_bound(a, 2, 0, UINT12_LIMIT);
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(
-    invariant(i <= MLKEM_N / 8)
-    invariant(array_bound(r->coeffs, 0, 8 * i, 0, MLKEM_Q)))
-  {
-    unsigned j;
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <  MLKEM_N / 8 && j <= 8)
-      invariant(array_bound(r->coeffs, 0, 8 * i + j, 0, MLKEM_Q)))
-    {
-      /* Prevent the compiler from recognizing this as a bit selection */
-      uint8_t mask = value_barrier_u8(1u << j);
-      r->coeffs[8 * i + j] = ct_sel_int16(HALF_Q, 0, msg[i] & mask);
-    }
-  }
-  debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
-}
+  t0 = (int32_t)a[1] * b_cached;
+  t0 += (int32_t)a[0] * b[0];
+  t1 = (int32_t)a[0] * b[1];
+  t1 += (int32_t)a[1] * b[0];
 
-MLKEM_NATIVE_INTERNAL_API
-void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *a)
-{
-  unsigned i;
-  debug_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
+  /* |ti| < 2 * UINT12_LIMIT * 2^15 */
+  r[0] = montgomery_reduce(t0);
+  r[1] = montgomery_reduce(t1);
 
-  for (i = 0; i < MLKEM_N / 8; i++)
-  __loop__(invariant(i <= MLKEM_N / 8))
-  {
-    unsigned j;
-    msg[i] = 0;
-    for (j = 0; j < 8; j++)
-    __loop__(
-      invariant(i <= MLKEM_N / 8 && j <= 8))
-    {
-      uint32_t t = scalar_compress_d1(a->coeffs[8 * i + j]);
-      msg[i] |= t << j;
-    }
-  }
+  debug_assert_abs_bound(r, 2, 2 * MLKEM_Q);
 }
 
 MLKEM_NATIVE_INTERNAL_API
@@ -434,12 +292,46 @@ void poly_tomont(poly *r)
 MLKEM_NATIVE_INTERNAL_API
 void poly_tomont(poly *r)
 {
-  poly_tomont_native(r);
+  poly_tomont_native(r->coeffs);
   debug_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_TOMONT */
 
 #if !defined(MLKEM_USE_NATIVE_POLY_REDUCE)
+/************************************************************
+ * Name: scalar_signed_to_unsigned_q
+ *
+ * Description: converts signed polynomial coefficient
+ *              from signed (-3328 .. 3328) form to
+ *              unsigned form (0 .. 3328).
+ *
+ * Note: Cryptographic constant time implementation
+ *
+ * Examples:       0 -> 0
+ *                 1 -> 1
+ *              3328 -> 3328
+ *                -1 -> 3328
+ *                -2 -> 3327
+ *             -3328 -> 1
+ *
+ * Arguments: c: signed coefficient to be converted
+ ************************************************************/
+static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
+__contract__(
+  requires(c > -MLKEM_Q && c < MLKEM_Q)
+  ensures(return_value >= 0 && return_value < MLKEM_Q)
+  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
+{
+  debug_assert_abs_bound(&c, 1, MLKEM_Q);
+
+  /* Add Q if c is negative, but in constant time */
+  c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
+
+  /* and therefore cast to uint16_t is safe. */
+  debug_assert_bound(&c, 1, 0, MLKEM_Q);
+  return (uint16_t)c;
+}
+
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
@@ -461,7 +353,7 @@ void poly_reduce(poly *r)
 MLKEM_NATIVE_INTERNAL_API
 void poly_reduce(poly *r)
 {
-  poly_reduce_native(r);
+  poly_reduce_native(r->coeffs);
   debug_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
 }
 #endif /* MLKEM_USE_NATIVE_POLY_REDUCE */
@@ -520,13 +412,232 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 MLKEM_NATIVE_INTERNAL_API
 void poly_mulcache_compute(poly_mulcache *x, const poly *a)
 {
-  poly_mulcache_compute_native(x, a);
+  poly_mulcache_compute_native(x->coeffs, a->coeffs);
   /* Omitting bounds assertion since native implementations may
    * decide not to use a mulcache. Note that the C backend implementation
    * of poly_basemul_montgomery_cached() does still include the check. */
 }
 #endif /* MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 
+#if !defined(MLKEM_USE_NATIVE_NTT)
+/*
+ * Computes a block of CT butterflies with a fixed twiddle factor,
+ * using Montgomery multiplication.
+ * Parameters:
+ * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
+ * - zeta: Twiddle factor to use for the butterfly. This must be in
+ *         Montgomery form and signed canonical.
+ * - start: Offset to the beginning of the butterfly block
+ * - len: Index difference between coefficients subject to a butterfly
+ * - bound: Ghost variable describing coefficient bound: Prior to `start`,
+ *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
+ *          they must be bound by `bound`.
+ * When this function returns, output coefficients in the index range
+ * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
+ * Example:
+ * - start=8, len=4
+ *   This would compute the following four butterflies
+ *          8     --    12
+ *             9    --     13
+ *                10   --     14
+ *                   11   --     15
+ * - start=4, len=2
+ *   This would compute the following two butterflies
+ *          4 -- 6
+ *             5 -- 7
+ */
+static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
+                                unsigned start, unsigned len, int bound)
+__contract__(
+  requires(start < MLKEM_N)
+  requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
+  requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
+  requires(-HALF_Q < zeta && zeta < HALF_Q)
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(array_abs_bound(r, 0, start, bound + MLKEM_Q))
+  requires(array_abs_bound(r, start, MLKEM_N, bound))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q))
+  ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
+{
+  /* `bound` is a ghost variable only needed in the CBMC specification */
+  unsigned j;
+  ((void)bound);
+  for (j = start; j < start + len; j++)
+  __loop__(
+    invariant(start <= j && j <= start + len)
+    /*
+     * Coefficients are updated in strided pairs, so the bounds for the
+     * intermediate states alternate twice between the old and new bound
+     */
+    invariant(array_abs_bound(r, 0,           j,           bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j,           start + len, bound))
+    invariant(array_abs_bound(r, start + len, j + len,     bound + MLKEM_Q))
+    invariant(array_abs_bound(r, j + len,     MLKEM_N,     bound)))
+  {
+    int16_t t;
+    t = fqmul(r[j + len], zeta);
+    r[j + len] = r[j] - t;
+    r[j] = r[j] + t;
+  }
+}
+
+/*
+ * Compute one layer of forward NTT
+ * Parameters:
+ * - r: Pointer to base of polynomial
+ * - len: Stride of butterflies in this layer.
+ * - layer: Ghost variable indicating which layer is being applied.
+ *          Must match `len` via `len == MLKEM_N >> layer`.
+ * Note: `len` could be dropped and computed in the function, but
+ *   we are following the structure of the reference NTT from the
+ *   official Kyber implementation here, merely adding `layer` as
+ *   a ghost variable for the specifications.
+ */
+static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
+  requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
+{
+  unsigned start, k;
+  /* `layer` is a ghost variable only needed in the CBMC specification */
+  ((void)layer);
+  /* Twiddle factors for layer n start at index 2^(layer-1) */
+  k = MLKEM_N / (2 * len);
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(start < MLKEM_N + 2 * len)
+    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
+    invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
+    invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
+  {
+    int16_t zeta = zetas[k++];
+    ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
+  }
+}
+
+/*
+ * Compute full forward NTT
+ * NOTE: This particular implementation satisfies a much tighter
+ * bound on the output coefficients (5*q) than the contractual one (8*q),
+ * but this is not needed in the calling code. Should we change the
+ * base multiplication strategy to require smaller NTT output bounds,
+ * the proof may need strengthening.
+ */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *p)
+{
+  unsigned len, layer;
+  int16_t *r;
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+  r = p->coeffs;
+
+  for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
+  __loop__(
+    invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer))
+    invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)))
+  {
+    ntt_layer(r, len, layer);
+  }
+
+  /* Check the stronger bound */
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
+}
+#else  /* MLKEM_USE_NATIVE_NTT */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *p)
+{
+  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+  ntt_native(p->coeffs);
+  debug_assert_abs_bound(p, MLKEM_N, NTT_BOUND);
+}
+#endif /* MLKEM_USE_NATIVE_NTT */
+
+#if !defined(MLKEM_USE_NATIVE_INTT)
+
+/* Compute one layer of inverse NTT */
+static void invntt_layer(int16_t *r, unsigned len, unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(2 <= len && len <= 128 && 1 <= layer && layer <= 7)
+  requires(len == (1 << (8 - layer)))
+  requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+{
+  unsigned start, k;
+  /* `layer` is a ghost variable used only in the specification */
+  ((void)layer);
+  k = MLKEM_N / len - 1;
+  for (start = 0; start < MLKEM_N; start += 2 * len)
+  __loop__(
+    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+    invariant(start <= MLKEM_N && k <= 127)
+    /* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
+    invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
+  {
+    unsigned j;
+    int16_t zeta = zetas[k--];
+    for (j = start; j < start + len; j++)
+    __loop__(
+      invariant(start <= j && j <= start + len)
+      invariant(start <= MLKEM_N && k <= 127)
+      invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+    {
+      int16_t t = r[j];
+      r[j] = barrett_reduce(t + r[j + len]);
+      r[j + len] = r[j + len] - t;
+      r[j + len] = fqmul(r[j + len], zeta);
+    }
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *p)
+{
+  /*
+   * Scale input polynomial to account for Montgomery factor
+   * and NTT twist. This also brings coefficients down to
+   * absolute value < MLKEM_Q.
+   */
+  unsigned j, len, layer;
+  const int16_t f = 1441;
+  int16_t *r = p->coeffs;
+
+  for (j = 0; j < MLKEM_N; j++)
+  __loop__(
+    invariant(j <= MLKEM_N)
+    invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
+  {
+    r[j] = fqmul(r[j], f);
+  }
+
+  /* Run the invNTT layers */
+  for (len = 2, layer = 7; len <= 128; len <<= 1, layer--)
+  __loop__(
+    invariant(2 <= len && len <= 256 && layer <= 7 && len == (1 << (8 - layer)))
+    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+  {
+    invntt_layer(p->coeffs, len, layer);
+  }
+
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
+}
+#else  /* MLKEM_USE_NATIVE_INTT */
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *p)
+{
+  intt_native(p->coeffs);
+  debug_assert_abs_bound(p, MLKEM_N, INVNTT_BOUND);
+}
+#endif /* MLKEM_USE_NATIVE_INTT */
+
 #else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
 
 #define empty_cu_poly MLKEM_NAMESPACE_K(empty_cu_poly)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.h
index 6a14c785d..cb0d67c1a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly.h
@@ -9,7 +9,7 @@
 #include <stdint.h>
 #include "cbmc.h"
 #include "common.h"
-#include "reduce.h"
+#include "debug.h"
 #include "verify.h"
 
 /* Absolute exclusive upper bound for the output of the inverse NTT */
@@ -18,6 +18,9 @@
 /* Absolute exclusive upper bound for the output of the forward NTT */
 #define NTT_BOUND (8 * MLKEM_Q)
 
+#define zetas MLKEM_NAMESPACE(zetas)
+extern const int16_t zetas[128];
+
 /*
  * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
  * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
@@ -38,520 +41,6 @@ typedef struct
   int16_t coeffs[MLKEM_N >> 1];
 } poly_mulcache;
 
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define scalar_compress_d1 MLKEM_NAMESPACE(scalar_compress_d1)
-#define scalar_compress_d4 MLKEM_NAMESPACE(scalar_compress_d4)
-#define scalar_compress_d5 MLKEM_NAMESPACE(scalar_compress_d5)
-#define scalar_compress_d10 MLKEM_NAMESPACE(scalar_compress_d10)
-#define scalar_compress_d11 MLKEM_NAMESPACE(scalar_compress_d11)
-#define scalar_decompress_d4 MLKEM_NAMESPACE(scalar_decompress_d4)
-#define scalar_decompress_d5 MLKEM_NAMESPACE(scalar_decompress_d5)
-#define scalar_decompress_d10 MLKEM_NAMESPACE(scalar_decompress_d10)
-#define scalar_decompress_d11 MLKEM_NAMESPACE(scalar_decompress_d11)
-#define scalar_signed_to_unsigned_q MLKEM_NAMESPACE(scalar_signed_to_unsigned_q)
-/* End of static namespacing */
-
-/************************************************************
- * Name: scalar_compress_d1
- *
- * Description: Computes round(u * 2 / q)
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 1.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d1(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 2)
-  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
-{
-  uint32_t d0 = u << 1;
-  d0 *= 645083;
-  d0 += 1u << 30;
-  d0 >>= 31;
-  return d0;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_compress_d4
- *
- * Description: Computes round(u * 16 / q) % 16
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 4.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d4(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 16)
-  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
-{
-  uint32_t d0 = (uint32_t)u * 1290160; /* 16 * round(2^28 / MLKEM_Q) */
-  return (d0 + (1u << 27)) >> 28;      /* round(d0/2^28) */
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d4
- *
- * Description: Computes round(u * q / 16)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 4.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d4(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 16)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 8) / 16; }
-
-/************************************************************
- * Name: scalar_compress_d5
- *
- * Description: Computes round(u * 32 / q) % 32
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 5.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d5(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < 32)
-  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
-{
-  uint32_t d0 = (uint32_t)u * 1290176; /* 2^5 * round(2^27 / MLKEM_Q) */
-  return (d0 + (1u << 26)) >> 27;      /* round(d0/2^27) */
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d5
- *
- * Description: Computes round(u * q / 32)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 5.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 32
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d5(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 32)
-  ensures(return_value <= MLKEM_Q - 1)
-) { return ((u * MLKEM_Q) + 16) / 32; }
-
-/************************************************************
- * Name: scalar_compress_d10
- *
- * Description: Computes round(u * 2**10 / q) % 2**10
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d10(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < (1u << 10))
-  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
-{
-  uint64_t d0 = (uint64_t)u * 2642263040; /* 2^10 * round(2^32 / MLKEM_Q) */
-  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
-  return (d0 & 0x3FF);
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d10
- *
- * Description: Computes round(u * q / 1024)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d10(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 1024)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 512) / 1024; }
-
-/************************************************************
- * Name: scalar_compress_d11
- *
- * Description: Computes round(u * 2**11 / q) % 2**11
- *
- *              Implements Compress_d from FIPS203, Eq (4.7),
- *              for d = 11.
- *
- * Arguments: - u: Unsigned canonical modulus modulo q
- *                 to be compressed.
- ************************************************************/
-/*
- * The multiplication in this routine will exceed UINT32_MAX
- * and wrap around for large values of u. This is expected and required.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
-#endif
-static INLINE uint32_t scalar_compress_d11(uint16_t u)
-__contract__(
-  requires(u <= MLKEM_Q - 1)
-  ensures(return_value < (1u << 11))
-  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
-{
-  uint64_t d0 = (uint64_t)u * 5284526080; /* 2^11 * round(2^33 / MLKEM_Q) */
-  d0 = (d0 + ((uint64_t)1u << 32)) >> 33;
-  return (d0 & 0x7FF);
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/************************************************************
- * Name: scalar_decompress_d11
- *
- * Description: Computes round(u * q / 1024)
- *
- *              Implements Decompress_d from FIPS203, Eq (4.8),
- *              for d = 10.
- *
- * Arguments: - u: Unsigned canonical modulus modulo 16
- *                 to be decompressed.
- ************************************************************/
-static INLINE uint16_t scalar_decompress_d11(uint32_t u)
-__contract__(
-  requires(0 <= u && u < 2048)
-  ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 1024) / 2048; }
-
-/************************************************************
- * Name: scalar_signed_to_unsigned_q
- *
- * Description: converts signed polynomial coefficient
- *              from signed (-3328 .. 3328) form to
- *              unsigned form (0 .. 3328).
- *
- * Note: Cryptographic constant time implementation
- *
- * Examples:       0 -> 0
- *                 1 -> 1
- *              3328 -> 3328
- *                -1 -> 3328
- *                -2 -> 3327
- *             -3328 -> 1
- *
- * Arguments: c: signed coefficient to be converted
- ************************************************************/
-static INLINE uint16_t scalar_signed_to_unsigned_q(int16_t c)
-__contract__(
-  requires(c > -MLKEM_Q && c < MLKEM_Q)
-  ensures(return_value >= 0 && return_value < MLKEM_Q)
-  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
-{
-  debug_assert_abs_bound(&c, 1, MLKEM_Q);
-
-  /* Add Q if c is negative, but in constant time */
-  c = ct_sel_int16(c + MLKEM_Q, c, ct_cmask_neg_i16(c));
-
-  /* and therefore cast to uint16_t is safe. */
-  debug_assert_bound(&c, 1, 0, MLKEM_Q);
-  return (uint16_t)c;
-}
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || \
-    (MLKEM_K == 2 || MLKEM_K == 3)
-#define poly_compress_d4 MLKEM_NAMESPACE(poly_compress_d4)
-/*************************************************
- * Name:        poly_compress_d4
- *
- * Description: Compression (4 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const poly *a);
-
-#define poly_compress_d10 MLKEM_NAMESPACE(poly_compress_d10)
-/*************************************************
- * Name:        poly_compress_d10
- *
- * Description: Compression (10 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const poly *a);
-
-#define poly_decompress_d4 MLKEM_NAMESPACE(poly_decompress_d4)
-/*************************************************
- * Name:        poly_decompress_d4
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d4(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
-
-#define poly_decompress_d10 MLKEM_NAMESPACE(poly_decompress_d10)
-/*************************************************
- * Name:        poly_decompress_d10
- *
- * Description: De-serialization and subsequent decompression (10 bits) of a
- *              polynomial; approximate inverse of poly_compress_d10
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d10(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || (MLKEM_K == 2 \
-          || MLKEM_K == 3) */
-
-#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4
-#define poly_compress_d5 MLKEM_NAMESPACE(poly_compress_d5)
-/*************************************************
- * Name:        poly_compress_d5
- *
- * Description: Compression (5 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const poly *a);
-
-#define poly_compress_d11 MLKEM_NAMESPACE(poly_compress_d11)
-/*************************************************
- * Name:        poly_compress_d11
- *
- * Description: Compression (11 bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const poly *a);
-
-#define poly_decompress_d5 MLKEM_NAMESPACE(poly_decompress_d5)
-/*************************************************
- * Name:        poly_decompress_d5
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d5(poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
-
-#define poly_decompress_d11 MLKEM_NAMESPACE(poly_decompress_d11)
-/*************************************************
- * Name:        poly_decompress_d11
- *
- * Description: De-serialization and subsequent decompression (11 bits) of a
- *              polynomial; approximate inverse of poly_compress_d11
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_decompress_d11(poly *r,
-                         const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
-#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 4 \
-        */
-
-#define poly_tobytes MLKEM_NAMESPACE(poly_tobytes)
-/*************************************************
- * Name:        poly_tobytes
- *
- * Description: Serialization of a polynomial.
- *              Signed coefficients are converted to
- *              unsigned form before serialization.
- *
- * Arguments:   INPUT:
- *              - a: const pointer to input polynomial,
- *                with each coefficient in the range [0,1,..,Q-1]
- *              OUTPUT
- *              - r: pointer to output byte array
- *                   (of MLKEM_POLYBYTES bytes)
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYBYTES))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r))
-);
-
-
-#define poly_frombytes MLKEM_NAMESPACE(poly_frombytes)
-/*************************************************
- * Name:        poly_frombytes
- *
- * Description: De-serialization of a polynomial.
- *
- * Arguments:   INPUT
- *              - a: pointer to input byte array
- *                   (of MLKEM_POLYBYTES bytes)
- *              OUTPUT
- *              - r: pointer to output polynomial, with
- *                   each coefficient unsigned and in the range
- *                   0 .. 4095
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, UINT12_LIMIT))
-);
-
-
-#define poly_frommsg MLKEM_NAMESPACE(poly_frommsg)
-/*************************************************
- * Name:        poly_frommsg
- *
- * Description: Convert 32-byte message to polynomial
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *msg: pointer to input message
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_frommsg(poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
-__contract__(
-  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-);
-
-#define poly_tomsg MLKEM_NAMESPACE(poly_tomsg)
-/*************************************************
- * Name:        poly_tomsg
- *
- * Description: Convert polynomial to 32-byte message
- *
- * Arguments:   - uint8_t *msg: pointer to output message
- *              - const poly *r: pointer to input polynomial
- *                Coefficients must be unsigned canonical
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const poly *r)
-__contract__(
-  requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(msg))
-);
-
 #define poly_basemul_montgomery_cached \
   MLKEM_NAMESPACE(poly_basemul_montgomery_cached)
 /*************************************************
@@ -715,4 +204,56 @@ __contract__(
   assigns(object_whole(r))
 );
 
+#define poly_ntt MLKEM_NAMESPACE(poly_ntt)
+/*************************************************
+ * Name:        poly_ntt
+ *
+ * Description: Computes negacyclic number-theoretic transform (NTT) of
+ *              a polynomial in place.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bound by MLKEM_Q in absolute value.
+ *
+ *              The output polynomial is in bitreversed order, and
+ *              coefficient-wise bound by NTT_BOUND in absolute value.
+ *
+ *              (NOTE: Sometimes the input to the NTT is actually smaller,
+ *               which gives better bounds.)
+ *
+ * Arguments:   - poly *r: pointer to in/output polynomial
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_ntt(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, NTT_BOUND))
+);
+
+#define poly_invntt_tomont MLKEM_NAMESPACE(poly_invntt_tomont)
+/*************************************************
+ * Name:        poly_invntt_tomont
+ *
+ * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+ *              of a polynomial in place;
+ *              inputs assumed to be in bitreversed order, output in normal
+ *              order
+ *
+ *              The input is assumed to be in bitreversed order, and can
+ *              have arbitrary coefficients in int16_t.
+ *
+ *              The output polynomial is in normal order, and
+ *              coefficient-wise bound by INVNTT_BOUND in absolute value.
+ *
+ * Arguments:   - poly *r: pointer to in/output polynomial
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_invntt_tomont(poly *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INVNTT_BOUND))
+);
+
 #endif /* POLY_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly_k.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly_k.c
new file mode 100644
index 000000000..c2d330ea9
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly_k.c
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "poly_k.h"
+#include <stdint.h>
+#include <string.h>
+#include "arith_backend.h"
+#include "compress.h"
+#include "sampling.h"
+#include "symmetric.h"
+
+#include "debug.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1)
+#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2)
+/* End of static namespacing */
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
+                         const polyvec *a)
+{
+  unsigned i;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_decompress_du(polyvec *r,
+                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
+  }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
+{
+  unsigned i;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
+  }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_ntt(polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_ntt(&r->vec[i]);
+  }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_invntt_tomont(polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_invntt_tomont(&r->vec[i]);
+  }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND);
+}
+
+#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+{
+  unsigned i;
+  poly t;
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+  /* Seed the accumulator r with the first product, then add the rest. */
+  poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]);
+  for (i = 1; i < MLKEM_K; i++)
+  {
+    poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i],
+                                   &b_cache->vec[i]);
+    poly_add(r, &t);
+  }
+
+  /*
+   * r is one polynomial accumulating MLKEM_K products, hence the bound
+   * MLKEM_K * 2 * MLKEM_Q over MLKEM_N coefficients. It holds for the C
+   * implementation but is omitted from the spec to not unnecessarily
+   * constrain native implementations; it is checked here nonetheless.
+   */
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_K * 2 * MLKEM_Q);
+}
+#else  /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+{
+  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
+  /* Omitting bounds assertion for cache since native implementations may
+   * decide not to use a mulcache. Note that the C backend implementation
+   * of poly_basemul_montgomery_cached() does still include the check. */
+  polyvec_basemul_acc_montgomery_cached_native(r->coeffs, (const int16_t *)a,
+                                               (const int16_t *)b,
+                                               (const int16_t *)b_cache);
+}
+#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
+{
+  polyvec_mulcache b_cache;
+  polyvec_mulcache_compute(&b_cache, b);
+  polyvec_basemul_acc_montgomery_cached(r, a, b, &b_cache);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_mulcache_compute(&x->vec[i], &a->vec[i]);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_reduce(polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_reduce(&r->vec[i]);
+  }
+
+  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_add(polyvec *r, const polyvec *b)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_add(&r->vec[i], &b->vec[i]);
+  }
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_tomont(polyvec *r)
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_K; i++)
+  {
+    poly_tomont(&r->vec[i]);
+  }
+
+  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
+}
+
+
+/*************************************************
+ * Name:        poly_cbd_eta1
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta1(poly *r,
+                                 const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
+)
+{
+#if MLKEM_ETA1 == 2
+  poly_cbd2(r, buf);
+#elif MLKEM_ETA1 == 3
+  poly_cbd3(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA1"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+{
+  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
+  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
+  memcpy(extkey0, seed, MLKEM_SYMBYTES);
+  memcpy(extkey1, seed, MLKEM_SYMBYTES);
+  memcpy(extkey2, seed, MLKEM_SYMBYTES);
+  memcpy(extkey3, seed, MLKEM_SYMBYTES);
+  extkey0[MLKEM_SYMBYTES] = nonce0;
+  extkey1[MLKEM_SYMBYTES] = nonce1;
+  extkey2[MLKEM_SYMBYTES] = nonce2;
+  extkey3[MLKEM_SYMBYTES] = nonce3;
+  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
+  poly_cbd_eta1(r0, buf0);
+  poly_cbd_eta1(r1, buf1);
+  poly_cbd_eta1(r2, buf2);
+  poly_cbd_eta1(r3, buf3);
+
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
+}
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+/*************************************************
+ * Name:        poly_cbd_eta2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter MLKEM_ETA2.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+static INLINE void poly_cbd_eta2(poly *r,
+                                 const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1)))
+{
+#if MLKEM_ETA2 == 2
+  poly_cbd2(r, buf);
+#else
+#error "Invalid value of MLKEM_ETA2"
+#endif
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+{
+  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
+  /* PRF input is the seed followed by the one-byte nonce. */
+  memcpy(extkey, seed, MLKEM_SYMBYTES);
+  extkey[MLKEM_SYMBYTES] = nonce;
+  prf_eta2(buf, extkey);
+
+  poly_cbd_eta2(r, buf);
+  /* Matches the ETA2 bound ensured by poly_cbd_eta2(), not ETA1. */
+  debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1);
+}
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+
+#if MLKEM_K == 2
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+{
+  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
+  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
+  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
+  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+  extkey[0][MLKEM_SYMBYTES] = nonce0;
+  extkey[1][MLKEM_SYMBYTES] = nonce1;
+  extkey[2][MLKEM_SYMBYTES] = nonce2;
+  extkey[3][MLKEM_SYMBYTES] = nonce3;
+
+  prf_eta1(buf1[0], extkey[0]);
+  prf_eta1(buf1[1], extkey[1]);
+  prf_eta2(buf2[0], extkey[2]);
+  prf_eta2(buf2[1], extkey[3]);
+
+  poly_cbd_eta1(r0, buf1[0]);
+  poly_cbd_eta1(r1, buf1[1]);
+  poly_cbd_eta2(r2, buf2[0]);
+  poly_cbd_eta2(r3, buf2[1]);
+
+  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
+  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1);
+  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1);
+}
+#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly_k.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly_k.h
new file mode 100644
index 000000000..0aea95912
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/poly_k.h
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef POLY_K_H
+#define POLY_K_H
+
+#include <stdint.h>
+#include "common.h"
+#include "compress.h"
+#include "poly.h"
+
+#define polyvec MLKEM_NAMESPACE_K(polyvec)
+typedef struct
+{
+  poly vec[MLKEM_K];
+} ALIGN polyvec;
+
+#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache)
+typedef struct
+{
+  poly_mulcache vec[MLKEM_K];
+} polyvec_mulcache;
+
+#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du)
+/*************************************************
+ * Name:        poly_compress_du
+ *
+ * Description: Compression (du bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU)))
+{
+#if MLKEM_DU == 10
+  poly_compress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_compress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du)
+/*************************************************
+ * Name:        poly_decompress_du
+ *
+ * Description: De-serialization and subsequent decompression (du bits) of a
+ *              polynomial; approximate inverse of poly_compress_du
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                   (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_du(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(memory_slice(r, sizeof(poly)))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DU == 10
+  poly_decompress_d10(r, a);
+#elif MLKEM_DU == 11
+  poly_decompress_d11(r, a);
+#else
+#error "Invalid value of MLKEM_DU"
+#endif
+}
+
+#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv)
+/*************************************************
+ * Name:        poly_compress_dv
+ *
+ * Description: Compression (dv bits) and subsequent serialization of a
+ *              polynomial
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *              - const poly *a: pointer to input polynomial
+ *                  Coefficients must be unsigned canonical,
+ *                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV],
+                                    const poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  assigns(object_whole(r)))
+{
+#if MLKEM_DV == 4
+  poly_compress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_compress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+
+#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv)
+/*************************************************
+ * Name:        poly_decompress_dv
+ *
+ * Description: De-serialization and subsequent decompression (dv bits) of a
+ *              polynomial; approximate inverse of poly_compress_dv
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *a: pointer to input byte array
+ *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
+ *
+ * Upon return, the coefficients of the output polynomial are unsigned-canonical
+ * (non-negative and smaller than MLKEM_Q).
+ *
+ **************************************************/
+static INLINE void poly_decompress_dv(
+    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
+  requires(memory_no_alias(r, sizeof(poly)))
+  assigns(object_whole(r))
+  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+{
+#if MLKEM_DV == 4
+  poly_decompress_d4(r, a);
+#elif MLKEM_DV == 5
+  poly_decompress_d5(r, a);
+#else
+#error "Invalid value of MLKEM_DV"
+#endif
+}
+
+#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du)
+/*************************************************
+ * Name:        polyvec_compress_du
+ *
+ * Description: Compress and serialize vector of polynomials
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU)
+ *              - const polyvec *a: pointer to input vector of polynomials.
+ *                                  Coefficients must be unsigned canonical,
+ *                                  i.e. in [0,1,..,MLKEM_Q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
+                         const polyvec *a)
+__contract__(
+  requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(forall(k0, 0, MLKEM_K,
+         array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+  assigns(object_whole(r))
+);
+
+#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du)
+/*************************************************
+ * Name:        polyvec_decompress_du
+ *
+ * Description: De-serialize and decompress vector of polynomials;
+ *              approximate inverse of polyvec_compress_du
+ *
+ * Arguments:   - polyvec *r:       pointer to output vector of polynomials.
+ *                Output will have coefficients normalized to [0,..,q-1].
+ *              - const uint8_t *a: pointer to input byte array
+ *                                  (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU)
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_decompress_du(polyvec *r,
+                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
+__contract__(
+  requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(k0, 0, MLKEM_K,
+         array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+);
+
+#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes)
+/*************************************************
+ * Name:        polyvec_tobytes
+ *
+ * Description: Serialize vector of polynomials
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                            (needs space for MLKEM_POLYVECBYTES)
+ *              - const polyvec *a: pointer to input vector of polynomials
+ *                  Each polynomial must have coefficients in [0,..,q-1].
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
+__contract__(
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(memory_no_alias(r, MLKEM_POLYVECBYTES))
+  requires(forall(k0, 0, MLKEM_K,
+         array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+  assigns(object_whole(r))
+);
+
+#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes)
+/*************************************************
+ * Name:        polyvec_frombytes
+ *
+ * Description: De-serialize vector of polynomials;
+ *              inverse of polyvec_tobytes
+ *
+ * Arguments:   - polyvec *r: pointer to output vector of polynomials.
+ *                 Output will have coefficients normalized in [0..4095].
+ *              - const uint8_t *a: pointer to input byte array
+ *                 (of length MLKEM_POLYVECBYTES)
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(memory_no_alias(a, MLKEM_POLYVECBYTES))
+  assigns(object_whole(r))
+  ensures(forall(k0, 0, MLKEM_K,
+        array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
+);
+
+#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt)
+/*************************************************
+ * Name:        polyvec_ntt
+ *
+ * Description: Apply forward NTT to all elements of a vector of polynomials.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bound by MLKEM_Q in absolute value.
+ *
+ *              The output polynomial is in bitreversed order, and
+ *              coefficient-wise bound by NTT_BOUND in absolute value.
+ *
+ * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_ntt(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(forall(j, 0, MLKEM_K,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+  assigns(object_whole(r))
+  ensures(forall(j, 0, MLKEM_K,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND)))
+);
+
+#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont)
+/*************************************************
+ * Name:        polyvec_invntt_tomont
+ *
+ * Description: Apply inverse NTT to all elements of a vector of polynomials
+ *              and multiply by Montgomery factor 2^16
+ *
+ *              The input is assumed to be in bitreversed order, and can
+ *              have arbitrary coefficients in int16_t.
+ *
+ *              The output polynomial is in normal order, and
+ *              coefficient-wise bound by INVNTT_BOUND in absolute value.
+ *
+ *
+ * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_invntt_tomont(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(j, 0, MLKEM_K,
+  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, INVNTT_BOUND)))
+);
+
+#define polyvec_basemul_acc_montgomery \
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery)
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery
+ *
+ * Description: Multiply elements of a and b in NTT domain, accumulate into r,
+ *              and multiply by 2^-16.
+ *
+ * Arguments: - poly *r: pointer to output polynomial
+ *            - const polyvec *a: pointer to first input vector of polynomials
+ *            - const polyvec *b: pointer to second input vector of polynomials
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(memory_no_alias(b, sizeof(polyvec)))
+  requires(forall(k1, 0, MLKEM_K,
+    array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
+  assigns(memory_slice(r, sizeof(poly)))
+);
+
+
+#define polyvec_basemul_acc_montgomery_cached \
+  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached)
+/*************************************************
+ * Name:        polyvec_basemul_acc_montgomery_cached
+ *
+ * Description: Scalar product of two vectors of polynomials in NTT domain,
+ *              using mulcache for second operand.
+ *
+ *              Bounds:
+ *              - Every coefficient of a is assumed to be in [0..4095]
+ *              - No bounds guarantees for the coefficients in the result.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const polyvec *a: pointer to first input polynomial vector
+ *              - const polyvec *b: pointer to second input polynomial vector
+ *              - const polyvec_mulcache *b_cache: pointer to mulcache
+ *                  for second input polynomial vector. Can be computed
+ *                  via polyvec_mulcache_compute().
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
+                                           const polyvec *b,
+                                           const polyvec_mulcache *b_cache)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  requires(memory_no_alias(b, sizeof(polyvec)))
+  requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache)))
+  requires(forall(k1, 0, MLKEM_K,
+     array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
+  assigns(memory_slice(r, sizeof(poly)))
+);
+
+#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute)
+/************************************************************
+ * Name: polyvec_mulcache_compute
+ *
+ * Description: Computes the mulcache for a vector of polynomials in NTT domain
+ *
+ *              The mulcache of a degree-2 polynomial b := b0 + b1*X
+ *              in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when
+ *              computing products of b in Fq[X]/(X^2-zeta).
+ *
+ *              The mulcache of a polynomial in NTT domain -- which is
+ *              a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta),
+ *              for varying zeta, is the 128-tuple of mulcaches of those
+ *              polynomials.
+ *
+ *              The mulcache of a vector of polynomials is the vector
+ *              of mulcaches of its entries.
+ *
+ * Arguments: - x: Pointer to mulcache to be populated
+ *            - a: Pointer to input polynomial vector
+ ************************************************************/
+/*
+ * NOTE: The default C implementation of this function populates
+ * the mulcache with values in (-q,q), but this is not needed for the
+ * higher level safety proofs, and thus not part of the spec.
+ */
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
+__contract__(
+  requires(memory_no_alias(x, sizeof(polyvec_mulcache)))
+  requires(memory_no_alias(a, sizeof(polyvec)))
+  assigns(object_whole(x))
+);
+
+#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce)
+/*************************************************
+ * Name:        polyvec_reduce
+ *
+ * Description: Applies Barrett reduction to each coefficient
+ *              of each element of a vector of polynomials;
+ *              for details of the Barrett reduction see comments in reduce.c
+ *
+ * Arguments:   - polyvec *r: pointer to input/output polynomial
+ **************************************************/
+/*
+ * NOTE: The semantics of polyvec_reduce() is different in
+ *       the reference implementation, which requires
+ *       signed canonical output data. Unsigned canonical
+ *       outputs are better suited to the only remaining
+ *       use of poly_reduce() in the context of (de)serialization.
+ */
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_reduce(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(k0, 0, MLKEM_K,
+    array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+);
+
+#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add)
+/*************************************************
+ * Name:        polyvec_add
+ *
+ * Description: Add vectors of polynomials
+ *
+ * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be
+ *              added to
+ *            - const polyvec *b: pointer to second input vector of polynomials
+ *
+ * The coefficients of r and b must be so that the addition does
+ * not overflow. Otherwise, the behaviour of this function is undefined.
+ *
+ * The coefficients returned in *r are in int16_t which is sufficient
+ * to prove type-safety of calling units. Therefore, no stronger
+ * ensures clause is required on this function.
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_add(polyvec *r, const polyvec *b)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  requires(memory_no_alias(b, sizeof(polyvec)))
+  requires(forall(j0, 0, MLKEM_K,
+          forall(k0, 0, MLKEM_N,
+            (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX)))
+  requires(forall(j1, 0, MLKEM_K,
+          forall(k1, 0, MLKEM_N,
+            (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN)))
+  assigns(object_whole(r))
+);
+
+#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont)
+/*************************************************
+ * Name:        polyvec_tomont
+ *
+ * Description: Inplace conversion of all coefficients of a polynomial
+ *              vector from normal domain to Montgomery domain
+ *
+ *              Bounds: Output < q in absolute value.
+ *
+ * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void polyvec_tomont(polyvec *r)
+__contract__(
+  requires(memory_no_alias(r, sizeof(polyvec)))
+  assigns(object_whole(r))
+  ensures(forall(j, 0, MLKEM_K,
+    array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+);
+
+#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial distribution
+ * with parameter MLKEM_ETA1.
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
+                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
+/* Depending on MLKEM_K, the pointers passed to this function belong
+   to the same objects, so we cannot use memory_no_alias for r0-r3.
+
+   NOTE: Somehow it is important to use memory_no_alias() first in the
+         conjunctions defining each case.
+*/
+#if MLKEM_K == 2
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
+    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+);
+#elif MLKEM_K == 4
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case B: r0, r1, r2, r3 consecutive */
+    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+);
+#elif MLKEM_K == 3
+__contract__(
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  requires( /* Case C: r0, r1, r2 consecutive */
+ (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
+  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
+  assigns(memory_slice(r0, sizeof(poly)))
+  assigns(memory_slice(r1, sizeof(poly)))
+  assigns(memory_slice(r2, sizeof(poly)))
+  assigns(memory_slice(r3, sizeof(poly)))
+  ensures(
+    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+);
+#endif /* MLKEM_K */
+
+#if MLKEM_ETA1 == MLKEM_ETA2
+/*
+ * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
+ * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
+ * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
+ */
+#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
+#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
+
+#if MLKEM_K == 2 || MLKEM_K == 4
+#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2)
+/*************************************************
+ * Name:        poly_getnoise_eta2
+ *
+ * Description: Sample a polynomial deterministically from a seed and a nonce,
+ *              with output polynomial close to centered binomial distribution
+ *              with parameter MLKEM_ETA2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
+                        uint8_t nonce)
+__contract__(
+  requires(memory_no_alias(r, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
+);
+#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
+
+#if MLKEM_K == 2
+#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x)
+/*************************************************
+ * Name:        poly_getnoise_eta1122_4x
+ *
+ * Description: Batch sample four polynomials deterministically from a seed
+ * and nonces, with output polynomials close to centered binomial
+ * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
+ *
+ * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
+ *              - const uint8_t *seed: pointer to input seed
+ *                                     (of length MLKEM_SYMBYTES bytes)
+ *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
+                              const uint8_t seed[MLKEM_SYMBYTES],
+                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                              uint8_t nonce3)
+__contract__(
+  requires( /* r0, r1 consecutive, r2, r3 consecutive */
+ (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
+   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
+  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
+     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1))
+);
+#endif /* MLKEM_K == 2 */
+
+#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/polyvec.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/polyvec.c
deleted file mode 100644
index 50ea1c34a..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/polyvec.c
+++ /dev/null
@@ -1,330 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "polyvec.h"
-#include <stdint.h>
-#include <string.h>
-#include "arith_backend.h"
-#include "cbd.h"
-#include "ntt.h"
-#include "poly.h"
-#include "symmetric.h"
-
-#include "debug.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define poly_cbd_eta1 MLKEM_NAMESPACE_K(poly_cbd_eta1)
-#define poly_cbd_eta2 MLKEM_NAMESPACE_K(poly_cbd_eta2)
-/* End of static namespacing */
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
-                         const polyvec *a)
-{
-  unsigned i;
-  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_decompress_du(polyvec *r,
-                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
-  }
-
-  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
-{
-  unsigned i;
-  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
-
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_tobytes(r + i * MLKEM_POLYBYTES, &a->vec[i]);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
-  }
-
-  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_ntt(polyvec *r)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_ntt(&r->vec[i]);
-  }
-
-  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, NTT_BOUND);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_invntt_tomont(polyvec *r)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_invntt_tomont(&r->vec[i]);
-  }
-
-  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, INVNTT_BOUND);
-}
-
-#if !defined(MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
-                                           const polyvec *b,
-                                           const polyvec_mulcache *b_cache)
-{
-  unsigned i;
-  poly t;
-  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
-
-  poly_basemul_montgomery_cached(r, &a->vec[0], &b->vec[0], &b_cache->vec[0]);
-  for (i = 1; i < MLKEM_K; i++)
-  {
-    poly_basemul_montgomery_cached(&t, &a->vec[i], &b->vec[i],
-                                   &b_cache->vec[i]);
-    poly_add(r, &t);
-  }
-
-  /*
-   * This bound is true for the C implementation, but not needed
-   * in the higher level bounds reasoning. It is thus omitted
-   * them from the spec to not unnecessarily constrain native
-   * implementations, but checked here nonetheless.
-   */
-  debug_assert_abs_bound(r, MLKEM_K, MLKEM_N * 2 * MLKEM_Q);
-}
-#else  /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
-                                           const polyvec *b,
-                                           const polyvec_mulcache *b_cache)
-{
-  debug_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, UINT12_LIMIT);
-  /* Omitting bounds assertion for cache since native implementations may
-   * decide not to use a mulcache. Note that the C backend implementation
-   * of poly_basemul_montgomery_cached() does still include the check. */
-  polyvec_basemul_acc_montgomery_cached_native(r, a, b, b_cache);
-}
-#endif /* MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
-{
-  polyvec_mulcache b_cache;
-  polyvec_mulcache_compute(&b_cache, b);
-  polyvec_basemul_acc_montgomery_cached(r, a, b, &b_cache);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_mulcache_compute(&x->vec[i], &a->vec[i]);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_reduce(polyvec *r)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_reduce(&r->vec[i]);
-  }
-
-  debug_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_add(polyvec *r, const polyvec *b)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_add(&r->vec[i], &b->vec[i]);
-  }
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_tomont(polyvec *r)
-{
-  unsigned i;
-  for (i = 0; i < MLKEM_K; i++)
-  {
-    poly_tomont(&r->vec[i]);
-  }
-
-  debug_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
-}
-
-
-/*************************************************
- * Name:        poly_cbd_eta1
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA1.
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-static INLINE void poly_cbd_eta1(poly *r,
-                                 const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
-)
-{
-#if MLKEM_ETA1 == 2
-  poly_cbd2(r, buf);
-#elif MLKEM_ETA1 == 3
-  poly_cbd3(r, buf);
-#else
-#error "Invalid value of MLKEM_ETA1"
-#endif
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-{
-  ALIGN uint8_t buf0[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf1[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf3[MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t extkey0[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey1[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey2[MLKEM_SYMBYTES + 1];
-  ALIGN uint8_t extkey3[MLKEM_SYMBYTES + 1];
-  memcpy(extkey0, seed, MLKEM_SYMBYTES);
-  memcpy(extkey1, seed, MLKEM_SYMBYTES);
-  memcpy(extkey2, seed, MLKEM_SYMBYTES);
-  memcpy(extkey3, seed, MLKEM_SYMBYTES);
-  extkey0[MLKEM_SYMBYTES] = nonce0;
-  extkey1[MLKEM_SYMBYTES] = nonce1;
-  extkey2[MLKEM_SYMBYTES] = nonce2;
-  extkey3[MLKEM_SYMBYTES] = nonce3;
-  prf_eta1_x4(buf0, buf1, buf2, buf3, extkey0, extkey1, extkey2, extkey3);
-  poly_cbd_eta1(r0, buf0);
-  poly_cbd_eta1(r1, buf1);
-  poly_cbd_eta1(r2, buf2);
-  poly_cbd_eta1(r3, buf3);
-
-  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
-}
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-/*************************************************
- * Name:        poly_cbd_eta2
- *
- * Description: Given an array of uniformly random bytes, compute
- *              polynomial with coefficients distributed according to
- *              a centered binomial distribution with parameter MLKEM_ETA2.
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *buf: pointer to input byte array
- **************************************************/
-static INLINE void poly_cbd_eta2(poly *r,
-                                 const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1)))
-{
-#if MLKEM_ETA2 == 2
-  poly_cbd2(r, buf);
-#else
-#error "Invalid value of MLKEM_ETA2"
-#endif
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-{
-  ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
-
-  memcpy(extkey, seed, MLKEM_SYMBYTES);
-  extkey[MLKEM_SYMBYTES] = nonce;
-  prf_eta2(buf, extkey);
-
-  poly_cbd_eta2(r, buf);
-
-  debug_assert_abs_bound(r, MLKEM_N, MLKEM_ETA1 + 1);
-}
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-
-#if MLKEM_K == 2
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-{
-  ALIGN uint8_t buf1[KECCAK_WAY / 2][MLKEM_ETA1 * MLKEM_N / 4];
-  ALIGN uint8_t buf2[KECCAK_WAY / 2][MLKEM_ETA2 * MLKEM_N / 4];
-  ALIGN uint8_t extkey[KECCAK_WAY][MLKEM_SYMBYTES + 1];
-  memcpy(extkey[0], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[1], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[2], seed, MLKEM_SYMBYTES);
-  memcpy(extkey[3], seed, MLKEM_SYMBYTES);
-  extkey[0][MLKEM_SYMBYTES] = nonce0;
-  extkey[1][MLKEM_SYMBYTES] = nonce1;
-  extkey[2][MLKEM_SYMBYTES] = nonce2;
-  extkey[3][MLKEM_SYMBYTES] = nonce3;
-
-  prf_eta1(buf1[0], extkey[0]);
-  prf_eta1(buf1[1], extkey[1]);
-  prf_eta2(buf2[0], extkey[2]);
-  prf_eta2(buf2[1], extkey[3]);
-
-  poly_cbd_eta1(r0, buf1[0]);
-  poly_cbd_eta1(r1, buf1[1]);
-  poly_cbd_eta2(r2, buf2[0]);
-  poly_cbd_eta2(r3, buf2[1]);
-
-  debug_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
-  debug_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1);
-  debug_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1);
-}
-#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/polyvec.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/polyvec.h
deleted file mode 100644
index 8be8579e0..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/polyvec.h
+++ /dev/null
@@ -1,595 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef POLYVEC_H
-#define POLYVEC_H
-
-#include <stdint.h>
-#include "common.h"
-#include "poly.h"
-
-#define polyvec MLKEM_NAMESPACE_K(polyvec)
-typedef struct
-{
-  poly vec[MLKEM_K];
-} ALIGN polyvec;
-
-#define polyvec_mulcache MLKEM_NAMESPACE_K(polyvec_mulcache)
-typedef struct
-{
-  poly_mulcache vec[MLKEM_K];
-} polyvec_mulcache;
-
-#define poly_compress_du MLKEM_NAMESPACE_K(poly_compress_du)
-/*************************************************
- * Name:        poly_compress_du
- *
- * Description: Compression (du bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-static INLINE void poly_compress_du(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU],
-                                    const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU)))
-{
-#if MLKEM_DU == 10
-  poly_compress_d10(r, a);
-#elif MLKEM_DU == 11
-  poly_compress_d11(r, a);
-#else
-#error "Invalid value of MLKEM_DU"
-#endif
-}
-
-#define poly_decompress_du MLKEM_NAMESPACE_K(poly_decompress_du)
-/*************************************************
- * Name:        poly_decompress_du
- *
- * Description: De-serialization and subsequent decompression (du bits) of a
- *              polynomial; approximate inverse of poly_compress_du
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                   (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-static INLINE void poly_decompress_du(
-    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(memory_slice(r, sizeof(poly)))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-#if MLKEM_DU == 10
-  poly_decompress_d10(r, a);
-#elif MLKEM_DU == 11
-  poly_decompress_d11(r, a);
-#else
-#error "Invalid value of MLKEM_DU"
-#endif
-}
-
-#define poly_compress_dv MLKEM_NAMESPACE_K(poly_compress_dv)
-/*************************************************
- * Name:        poly_compress_dv
- *
- * Description: Compression (dv bits) and subsequent serialization of a
- *              polynomial
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
- *              - const poly *a: pointer to input polynomial
- *                  Coefficients must be unsigned canonical,
- *                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-static INLINE void poly_compress_dv(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV],
-                                    const poly *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(a, sizeof(poly)))
-  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  assigns(object_whole(r)))
-{
-#if MLKEM_DV == 4
-  poly_compress_d4(r, a);
-#elif MLKEM_DV == 5
-  poly_compress_d5(r, a);
-#else
-#error "Invalid value of MLKEM_DV"
-#endif
-}
-
-
-#define poly_decompress_dv MLKEM_NAMESPACE_K(poly_decompress_dv)
-/*************************************************
- * Name:        poly_decompress_dv
- *
- * Description: De-serialization and subsequent decompression (dv bits) of a
- *              polynomial; approximate inverse of poly_compress
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *a: pointer to input byte array
- *                  (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
- *
- * Upon return, the coefficients of the output polynomial are unsigned-canonical
- * (non-negative and smaller than MLKEM_Q).
- *
- **************************************************/
-static INLINE void poly_decompress_dv(
-    poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
-  requires(memory_no_alias(r, sizeof(poly)))
-  assigns(object_whole(r))
-  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-{
-#if MLKEM_DV == 4
-  poly_decompress_d4(r, a);
-#elif MLKEM_DV == 5
-  poly_decompress_d5(r, a);
-#else
-#error "Invalid value of MLKEM_DV"
-#endif
-}
-
-#define polyvec_compress_du MLKEM_NAMESPACE_K(polyvec_compress_du)
-/*************************************************
- * Name:        polyvec_compress_du
- *
- * Description: Compress and serialize vector of polynomials
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU)
- *              - const polyvec *a: pointer to input vector of polynomials.
- *                                  Coefficients must be unsigned canonical,
- *                                  i.e. in [0,1,..,MLKEM_Q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
-                         const polyvec *a)
-__contract__(
-  requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  requires(forall(k0, 0, MLKEM_K,
-         array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-  assigns(object_whole(r))
-);
-
-#define polyvec_decompress_du MLKEM_NAMESPACE_K(polyvec_decompress_du)
-/*************************************************
- * Name:        polyvec_decompress_du
- *
- * Description: De-serialize and decompress vector of polynomials;
- *              approximate inverse of polyvec_compress_du
- *
- * Arguments:   - polyvec *r:       pointer to output vector of polynomials.
- *                Output will have coefficients normalized to [0,..,q-1].
- *              - const uint8_t *a: pointer to input byte array
- *                                  (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU)
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_decompress_du(polyvec *r,
-                           const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
-__contract__(
-  requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  assigns(object_whole(r))
-  ensures(forall(k0, 0, MLKEM_K,
-         array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-);
-
-#define polyvec_tobytes MLKEM_NAMESPACE_K(polyvec_tobytes)
-/*************************************************
- * Name:        polyvec_tobytes
- *
- * Description: Serialize vector of polynomials
- *
- * Arguments:   - uint8_t *r: pointer to output byte array
- *                            (needs space for MLKEM_POLYVECBYTES)
- *              - const polyvec *a: pointer to input vector of polynomials
- *                  Each polynomial must have coefficients in [0,..,q-1].
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const polyvec *a)
-__contract__(
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  requires(memory_no_alias(r, MLKEM_POLYVECBYTES))
-  requires(forall(k0, 0, MLKEM_K,
-         array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-  assigns(object_whole(r))
-);
-
-#define polyvec_frombytes MLKEM_NAMESPACE_K(polyvec_frombytes)
-/*************************************************
- * Name:        polyvec_frombytes
- *
- * Description: De-serialize vector of polynomials;
- *              inverse of polyvec_tobytes
- *
- * Arguments:   - const polyvec *a: pointer to output vector of polynomials
- *                 (of length MLKEM_POLYVECBYTES). Output will have coefficients
- *                 normalized in [0..4095].
- *              - uint8_t *r: pointer to input byte array
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  requires(memory_no_alias(a, MLKEM_POLYVECBYTES))
-  assigns(object_whole(r))
-  ensures(forall(k0, 0, MLKEM_K,
-        array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
-);
-
-#define polyvec_ntt MLKEM_NAMESPACE_K(polyvec_ntt)
-/*************************************************
- * Name:        polyvec_ntt
- *
- * Description: Apply forward NTT to all elements of a vector of polynomials.
- *
- *              The input is assumed to be in normal order and
- *              coefficient-wise bound by MLKEM_Q in absolute value.
- *
- *              The output polynomial is in bitreversed order, and
- *              coefficient-wise bound by NTT_BOUND in absolute value.
- *
- * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_ntt(polyvec *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  requires(forall(j, 0, MLKEM_K,
-  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
-  assigns(object_whole(r))
-  ensures(forall(j, 0, MLKEM_K,
-  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, NTT_BOUND)))
-);
-
-#define polyvec_invntt_tomont MLKEM_NAMESPACE_K(polyvec_invntt_tomont)
-/*************************************************
- * Name:        polyvec_invntt_tomont
- *
- * Description: Apply inverse NTT to all elements of a vector of polynomials
- *              and multiply by Montgomery factor 2^16
- *
- *              The input is assumed to be in bitreversed order, and can
- *              have arbitrary coefficients in int16_t.
- *
- *              The output polynomial is in normal order, and
- *              coefficient-wise bound by INVNTT_BOUND in absolute value.
- *
- *
- * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_invntt_tomont(polyvec *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  assigns(object_whole(r))
-  ensures(forall(j, 0, MLKEM_K,
-  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, INVNTT_BOUND)))
-);
-
-#define polyvec_basemul_acc_montgomery \
-  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery)
-/*************************************************
- * Name:        polyvec_basemul_acc_montgomery
- *
- * Description: Multiply elements of a and b in NTT domain, accumulate into r,
- *              and multiply by 2^-16.
- *
- * Arguments: - poly *r: pointer to output polynomial
- *            - const polyvec *a: pointer to first input vector of polynomials
- *            - const polyvec *b: pointer to second input vector of polynomials
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  requires(memory_no_alias(b, sizeof(polyvec)))
-  requires(forall(k1, 0, MLKEM_K,
-    array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
-  assigns(memory_slice(r, sizeof(poly)))
-);
-
-
-#define polyvec_basemul_acc_montgomery_cached \
-  MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached)
-/*************************************************
- * Name:        polyvec_basemul_acc_montgomery_cached
- *
- * Description: Scalar product of two vectors of polynomials in NTT domain,
- *              using mulcache for second operand.
- *
- *              Bounds:
- *              - Every coefficient of a is assumed to be in [0..4095]
- *              - No bounds guarantees for the coefficients in the result.
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const polyvec *a: pointer to first input polynomial vector
- *              - const polyvec *b: pointer to second input polynomial vector
- *              - const polyvec_mulcache *b_cache: pointer to mulcache
- *                  for second input polynomial vector. Can be computed
- *                  via polyvec_mulcache_compute().
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a,
-                                           const polyvec *b,
-                                           const polyvec_mulcache *b_cache)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  requires(memory_no_alias(b, sizeof(polyvec)))
-  requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache)))
-  requires(forall(k1, 0, MLKEM_K,
-     array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, UINT12_LIMIT)))
-  assigns(memory_slice(r, sizeof(poly)))
-);
-
-#define polyvec_mulcache_compute MLKEM_NAMESPACE_K(polyvec_mulcache_compute)
-/************************************************************
- * Name: polyvec_mulcache_compute
- *
- * Description: Computes the mulcache for a vector of polynomials in NTT domain
- *
- *              The mulcache of a degree-2 polynomial b := b0 + b1*X
- *              in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when
- *              computing products of b in Fq[X]/(X^2-zeta).
- *
- *              The mulcache of a polynomial in NTT domain -- which is
- *              a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta),
- *              for varying zeta, is the 128-tuple of mulcaches of those
- *              polynomials.
- *
- *              The mulcache of a vector of polynomials is the vector
- *              of mulcaches of its entries.
- *
- * Arguments: - x: Pointer to mulcache to be populated
- *            - a: Pointer to input polynomial vector
- ************************************************************/
-/*
- * NOTE: The default C implementation of this function populates
- * the mulcache with values in (-q,q), but this is not needed for the
- * higher level safety proofs, and thus not part of the spec.
- */
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_mulcache_compute(polyvec_mulcache *x, const polyvec *a)
-__contract__(
-  requires(memory_no_alias(x, sizeof(polyvec_mulcache)))
-  requires(memory_no_alias(a, sizeof(polyvec)))
-  assigns(object_whole(x))
-);
-
-#define polyvec_reduce MLKEM_NAMESPACE_K(polyvec_reduce)
-/*************************************************
- * Name:        polyvec_reduce
- *
- * Description: Applies Barrett reduction to each coefficient
- *              of each element of a vector of polynomials;
- *              for details of the Barrett reduction see comments in reduce.c
- *
- * Arguments:   - polyvec *r: pointer to input/output polynomial
- **************************************************/
-/*
- * NOTE: The semantics of polyvec_reduce() is different in
- *       the reference implementation, which requires
- *       signed canonical output data. Unsigned canonical
- *       outputs are better suited to the only remaining
- *       use of poly_reduce() in the context of (de)serialization.
- */
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_reduce(polyvec *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  assigns(object_whole(r))
-  ensures(forall(k0, 0, MLKEM_K,
-    array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
-);
-
-#define polyvec_add MLKEM_NAMESPACE_K(polyvec_add)
-/*************************************************
- * Name:        polyvec_add
- *
- * Description: Add vectors of polynomials
- *
- * Arguments: - polyvec *r: pointer to input-output vector of polynomials to be
- *              added to
- *            - const polyvec *b: pointer to second input vector of polynomials
- *
- * The coefficients of r and b must be so that the addition does
- * not overflow. Otherwise, the behaviour of this function is undefined.
- *
- * The coefficients returned in *r are in int16_t which is sufficient
- * to prove type-safety of calling units. Therefore, no stronger
- * ensures clause is required on this function.
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_add(polyvec *r, const polyvec *b)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  requires(memory_no_alias(b, sizeof(polyvec)))
-  requires(forall(j0, 0, MLKEM_K,
-          forall(k0, 0, MLKEM_N,
-            (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX)))
-  requires(forall(j1, 0, MLKEM_K,
-          forall(k1, 0, MLKEM_N,
-            (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN)))
-  assigns(object_whole(r))
-);
-
-#define polyvec_tomont MLKEM_NAMESPACE_K(polyvec_tomont)
-/*************************************************
- * Name:        polyvec_tomont
- *
- * Description: Inplace conversion of all coefficients of a polynomial
- *              vector from normal domain to Montgomery domain
- *
- *              Bounds: Output < q in absolute value.
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void polyvec_tomont(polyvec *r)
-__contract__(
-  requires(memory_no_alias(r, sizeof(polyvec)))
-  assigns(memory_slice(r, sizeof(polyvec)))
-  assigns(object_whole(r))
-  ensures(forall(j, 0, MLKEM_K,
-    array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
-);
-
-#define poly_getnoise_eta1_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and nonces, with output polynomials close to centered binomial distribution
- * with parameter MLKEM_ETA1.
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                           const uint8_t seed[MLKEM_SYMBYTES], uint8_t nonce0,
-                           uint8_t nonce1, uint8_t nonce2, uint8_t nonce3)
-/* Depending on MLKEM_K, the pointers passed to this function belong
-   to the same objects, so we cannot use memory_no_alias for r0-r3.
-
-   NOTE: Somehow it is important to use memory_no_alias() first in the
-         conjunctions defining each case.
-*/
-#if MLKEM_K == 2
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case A: r0, r1 consecutive, r2, r3 consecutive */
-    (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-     r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 4
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case B: r0, r1, r2, r3 consecutive */
-    (memory_no_alias(r0, 4 * sizeof(poly)) && r1 == r0 + 1 && r2 == r0 + 2 && r3 == r0 + 3))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#elif MLKEM_K == 3
-__contract__(
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  requires( /* Case C: r0, r1, r2 consecutive */
- (memory_no_alias(r0, 3 * sizeof(poly)) && memory_no_alias(r3, 1 * sizeof(poly)) &&
-  r1 == r0 + 1 && r2 == r0 + 2 && !same_object(r3, r0)))
-  assigns(memory_slice(r0, sizeof(poly)))
-  assigns(memory_slice(r1, sizeof(poly)))
-  assigns(memory_slice(r2, sizeof(poly)))
-  assigns(memory_slice(r3, sizeof(poly)))
-  ensures(
-    array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-    && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
-);
-#endif /* MLKEM_K */
-
-#if MLKEM_ETA1 == MLKEM_ETA2
-/*
- * We only require poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
- * where MLKEM_ETA2 = MLKEM_ETA1 = 2.
- * For ml-kem-512, poly_getnoise_eta1122_4x is used instead.
- */
-#define poly_getnoise_eta2_4x poly_getnoise_eta1_4x
-#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
-
-#if MLKEM_K == 2 || MLKEM_K == 4
-#define poly_getnoise_eta2 MLKEM_NAMESPACE_K(poly_getnoise_eta2)
-/*************************************************
- * Name:        poly_getnoise_eta2
- *
- * Description: Sample a polynomial deterministically from a seed and a nonce,
- *              with output polynomial close to centered binomial distribution
- *              with parameter MLKEM_ETA2
- *
- * Arguments:   - poly *r: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta2(poly *r, const uint8_t seed[MLKEM_SYMBYTES],
-                        uint8_t nonce)
-__contract__(
-  requires(memory_no_alias(r, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r))
-  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
-);
-#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
-
-#if MLKEM_K == 2
-#define poly_getnoise_eta1122_4x MLKEM_NAMESPACE_K(poly_getnoise_eta1122_4x)
-/*************************************************
- * Name:        poly_getnoise_eta1122_4x
- *
- * Description: Batch sample four polynomials deterministically from a seed
- * and a nonces, with output polynomials close to centered binomial
- * distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
- *
- * Arguments:   - poly *r{0,1,2,3}: pointer to output polynomial
- *              - const uint8_t *seed: pointer to input seed
- *                                     (of length MLKEM_SYMBYTES bytes)
- *              - uint8_t nonce{0,1,2,3}: one-byte input nonce
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_getnoise_eta1122_4x(poly *r0, poly *r1, poly *r2, poly *r3,
-                              const uint8_t seed[MLKEM_SYMBYTES],
-                              uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
-                              uint8_t nonce3)
-__contract__(
-  requires( /* r0, r1 consecutive, r2, r3 consecutive */
- (memory_no_alias(r0, 2 * sizeof(poly)) && memory_no_alias(r2, 2 * sizeof(poly)) &&
-   r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES))
-  assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
-  ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
-     && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
-     && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
-);
-#endif /* MLKEM_K == 2 */
-
-#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/reduce.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/reduce.h
deleted file mode 100644
index b432a4201..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/reduce.h
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef REDUCE_H
-#define REDUCE_H
-
-#include <stdint.h>
-#include "cbmc.h"
-#include "common.h"
-#include "debug.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define cast_uint16_to_int16 MLKEM_NAMESPACE(cast_uint16_to_int16)
-#define montgomery_reduce_generic MLKEM_NAMESPACE(montgomery_reduce_generic)
-#define montgomery_reduce MLKEM_NAMESPACE(montgomery_reduce)
-#define fqmul MLKEM_NAMESPACE(fqmul)
-#define barrett_reduce MLKEM_NAMESPACE(barrett_reduce)
-/* End of static namespacing */
-
-#define HALF_Q ((MLKEM_Q + 1) / 2) /* 1665 */
-
-/*************************************************
- * Name:        cast_uint16_to_int16
- *
- * Description: Cast uint16 value to int16
- *
- * Returns:
- *   input x in     0 .. 32767: returns value unchanged
- *   input x in 32768 .. 65535: returns (x - 65536)
- **************************************************/
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
-ALWAYS_INLINE
-static INLINE int16_t cast_uint16_to_int16(uint16_t x)
-{
-  /*
-   * PORTABILITY: This relies on uint16_t -> int16_t
-   * being implemented as the inverse of int16_t -> uint16_t,
-   * which is implementation-defined (C99 6.3.1.3 (3))
-   * CBMC (correctly) fails to prove this conversion is OK,
-   * so we have to suppress that check here
-   */
-  return (int16_t)x;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/*************************************************
- * Name:        montgomery_reduce_generic
- *
- * Description: Generic Montgomery reduction; given a 32-bit integer a, computes
- *              16-bit integer congruent to a * R^-1 mod q, where R=2^16
- *
- * Arguments:   - int32_t a: input integer to be reduced
- *
- * Returns:     integer congruent to a * R^-1 modulo q, with absolute value
- *                <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2
- *
- **************************************************/
-ALWAYS_INLINE
-static INLINE int16_t montgomery_reduce_generic(int32_t a)
-{
-  /* QINV == -3327 converted to uint16_t == -3327 + 65536 == 62209 */
-  const uint32_t QINV = 62209; /* q^-1 mod 2^16 */
-
-  /*  Compute a*q^{-1} mod 2^16 in unsigned representatives */
-  const uint16_t a_reduced = a & UINT16_MAX;
-  const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
-
-  /* Lift to signed canonical representative mod 2^16. */
-  const int16_t t = cast_uint16_to_int16(a_inverted);
-
-  int32_t r = a - ((int32_t)t * MLKEM_Q);
-  /* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */
-
-  /*
-   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
-   * implementation-defined for negative left argument. Here,
-   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
-   */
-  r = r >> 16;
-  /* Bounds: |r >> 16| <= ceil(|r| / 2^16)
-   *                   <= ceil(|a| / 2^16 + MLKEM_Q / 2)
-   *                   <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
-   *
-   * (Note that |a >> n| = ceil(|a| / 2^16) for negative a)
-   */
-
-  return (int16_t)r;
-}
-
-/*************************************************
- * Name:        montgomery_reduce
- *
- * Description: Montgomery reduction
- *
- * Arguments:   - int32_t a: input integer to be reduced
- *                  Must be smaller than 2 * 2^12 * 2^15 in absolute value.
- *
- * Returns:     integer congruent to a * R^-1 modulo q,
- *              smaller than 2 * q in absolute value.
- **************************************************/
-static INLINE int16_t montgomery_reduce(int32_t a)
-__contract__(
-  requires(a > -(2 * UINT12_LIMIT * 32768))
-  requires(a <  (2 * UINT12_LIMIT * 32768))
-  ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q)
-)
-{
-  int16_t res;
-  debug_assert_abs_bound(&a, 1, 2 * UINT12_LIMIT * 32768);
-
-  res = montgomery_reduce_generic(a);
-  /* Bounds:
-   * |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil(2 * UINT12_LIMIT * 32768 / 65536) + (MLKEM_Q + 1) / 2
-   *       <= UINT12_LIMIT + (MLKEM_Q + 1) / 2
-   *        < 2 * MLKEM_Q */
-
-  debug_assert_abs_bound(&res, 1, 2 * MLKEM_Q);
-  return res;
-}
-
-/*************************************************
- * Name:        fqmul
- *
- * Description: Montgomery multiplication modulo q=3329
- *
- * Arguments:   - int16_t a: first factor
- *                  Can be any int16_t.
- *              - int16_t b: second factor.
- *                  Must be signed canonical (abs value <(q+1)/2)
- *
- * Returns 16-bit integer congruent to a*b*R^{-1} mod q, and
- * smaller than q in absolute value.
- *
- **************************************************/
-static INLINE int16_t fqmul(int16_t a, int16_t b)
-__contract__(
-  requires(b > -HALF_Q)
-  requires(b < HALF_Q)
-  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
-)
-{
-  int16_t res;
-  debug_assert_abs_bound(&b, 1, HALF_Q);
-
-  res = montgomery_reduce((int32_t)a * (int32_t)b);
-  /* Bounds:
-   * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
-   *       <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
-   *        < MLKEM_Q
-   */
-
-  debug_assert_abs_bound(&res, 1, MLKEM_Q);
-  return res;
-}
-
-/*************************************************
- * Name:        barrett_reduce
- *
- * Description: Barrett reduction; given a 16-bit integer a, computes
- *              centered representative congruent to a mod q in
- *              {-(q-1)/2,...,(q-1)/2}
- *
- * Arguments:   - int16_t a: input integer to be reduced
- *
- * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
- **************************************************/
-static INLINE int16_t barrett_reduce(int16_t a)
-__contract__(
-  ensures(return_value > -HALF_Q && return_value < HALF_Q)
-)
-{
-  /*
-   * To divide by MLKEM_Q using Barrett multiplication, the "magic number"
-   * multiplier is round_to_nearest(2**26/MLKEM_Q)
-   */
-  const int BPOWER = 26;
-  const int32_t barrett_multiplier = ((1 << BPOWER) + MLKEM_Q / 2) / MLKEM_Q;
-
-  /*
-   * Compute round_to_nearest(a/MLKEM_Q) using the multiplier
-   * above and shift by BPOWER places.
-   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
-   * implementation-defined for negative left argument. Here,
-   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
-   */
-  const int32_t t = (barrett_multiplier * a + (1 << (BPOWER - 1))) >> BPOWER;
-
-  /*
-   * t is in -10 .. +10, so we need 32-bit math to
-   * evaluate t * MLKEM_Q and the subsequent subtraction
-   */
-  int16_t res = (int16_t)(a - t * MLKEM_Q);
-
-  debug_assert_abs_bound(&res, 1, HALF_Q);
-  return res;
-}
-
-#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/rej_uniform.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/rej_uniform.c
deleted file mode 100644
index cbbe4407f..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/rej_uniform.c
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#include "common.h"
-#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-
-#include "arith_backend.h"
-#include "debug.h"
-#include "fips202.h"
-#include "fips202x4.h"
-#include "rej_uniform.h"
-#include "symmetric.h"
-
-/* Static namespacing
- * This is to facilitate building multiple instances
- * of mlkem-native (e.g. with varying security levels)
- * within a single compilation unit. */
-#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
-#define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
-/* End of static namespacing */
-
-static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
-                                       unsigned int offset, const uint8_t *buf,
-                                       unsigned int buflen)
-__contract__(
-  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
-  requires(memory_no_alias(r, sizeof(int16_t) * target))
-  requires(memory_no_alias(buf, buflen))
-  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * target))
-  ensures(offset <= return_value && return_value <= target)
-  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
-)
-{
-  unsigned int ctr, pos;
-  uint16_t val0, val1;
-
-  debug_assert_bound(r, offset, 0, MLKEM_Q);
-
-  ctr = offset;
-  pos = 0;
-  /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
-  while (ctr < target && pos + 3 <= buflen)
-  __loop__(
-    invariant(offset <= ctr && ctr <= target && pos <= buflen)
-    invariant(ctr > 0 ==> array_bound(r, 0, ctr, 0, MLKEM_Q)))
-  {
-    val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
-    val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
-    pos += 3;
-
-    if (val0 < MLKEM_Q)
-    {
-      r[ctr++] = val0;
-    }
-    if (ctr < target && val1 < MLKEM_Q)
-    {
-      r[ctr++] = val1;
-    }
-  }
-
-  debug_assert_bound(r, ctr, 0, MLKEM_Q);
-  return ctr;
-}
-
-#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
-/*************************************************
- * Name:        rej_uniform
- *
- * Description: Run rejection sampling on uniform random bytes to generate
- *              uniform random integers mod q
- *
- * Arguments:   - int16_t *r:          pointer to output buffer
- *              - unsigned int target: requested number of 16-bit integers
- *                                     (uniform mod q).
- *                                     Must be <= 4096.
- *              - unsigned int offset: number of 16-bit integers that have
- *                                     already been sampled.
- *                                     Must be <= target.
- *              - const uint8_t *buf:  pointer to input buffer
- *                                     (assumed to be uniform random bytes)
- *              - unsigned int buflen: length of input buffer in bytes
- *                                     Must be <= 4096.
- *                                     Must be a multiple of 3.
- *
- * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 4096 is somewhat arbitary but sufficient for all
- * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
- *
- * Returns the new offset of sampled 16-bit integers, at most target,
- * and at least the initial offset.
- * If the new offset is strictly less than len, all of the input buffers
- * is guaranteed to have been consumed. If it is equal to len, no information
- * is provided on how many bytes of the input buffer have been consumed.
- **************************************************/
-
-/*
- * NOTE: The signature differs from the Kyber reference implementation
- * in that it adds the offset and always expects the base of the target
- * buffer. This avoids shifting the buffer base in the caller, which appears
- * tricky to reason about.
- */
-static unsigned int rej_uniform(int16_t *r, unsigned int target,
-                                unsigned int offset, const uint8_t *buf,
-                                unsigned int buflen)
-__contract__(
-  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
-  requires(memory_no_alias(r, sizeof(int16_t) * target))
-  requires(memory_no_alias(buf, buflen))
-  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
-  assigns(memory_slice(r, sizeof(int16_t) * target))
-  ensures(offset <= return_value && return_value <= target)
-  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
-)
-{
-  return rej_uniform_scalar(r, target, offset, buf, buflen);
-}
-#else  /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-static unsigned int rej_uniform(int16_t *r, unsigned int target,
-                                unsigned int offset, const uint8_t *buf,
-                                unsigned int buflen)
-{
-  int ret;
-
-  /* Sample from large buffer with full lane as much as possible. */
-  ret = rej_uniform_native(r + offset, target - offset, buf, buflen);
-  if (ret != -1)
-  {
-    unsigned res = offset + (unsigned)ret;
-    debug_assert_bound(r, res, 0, MLKEM_Q);
-    return res;
-  }
-
-  return rej_uniform_scalar(r, target, offset, buf, buflen);
-}
-#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
-
-#ifndef MLKEM_GEN_MATRIX_NBLOCKS
-#define MLKEM_GEN_MATRIX_NBLOCKS \
-  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
-#endif
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
-{
-  /* Temporary buffers for XOF output before rejection sampling */
-  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-
-  /* Tracks the number of coefficients we have already sampled */
-  unsigned int ctr[KECCAK_WAY];
-  xof_x4_ctx statex;
-  unsigned int buflen;
-
-  shake128x4_inc_init(&statex);
-
-  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
-  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
-                MLKEM_SYMBYTES + 2);
-
-  /*
-   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   * This should generate the matrix entries with high probability.
-   */
-  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
-                       &statex);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
-  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
-  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
-  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
-
-  /*
-   * So long as not all matrix entries have been generated, squeeze
-   * one more block a time until we're done.
-   */
-  buflen = XOF_RATE;
-  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
-         ctr[3] < MLKEM_N)
-  __loop__(
-    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
-       object_whole(buf1), object_whole(buf2), object_whole(buf3))
-    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
-    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
-    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
-    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
-    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
-    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
-  {
-    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
-    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
-    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
-    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
-    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
-  }
-
-  xof_x4_release(&statex);
-}
-
-MLKEM_NATIVE_INTERNAL_API
-void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
-{
-  xof_ctx state;
-  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
-  unsigned int ctr, buflen;
-
-  shake128_inc_init(&state);
-
-  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
-
-  /* Initially, squeeze + sample heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
-   */
-  /* This should generate the matrix entry with high probability. */
-  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
-  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
-  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
-
-  /* Squeeze + sample one more block a time until we're done */
-  buflen = XOF_RATE;
-  while (ctr < MLKEM_N)
-  __loop__(
-    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
-    invariant(ctr <= MLKEM_N)
-    invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q)))
-  {
-    xof_squeezeblocks(buf, 1, &state);
-    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
-  }
-
-  xof_release(&state);
-}
-
-#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
-
-#define empty_cu_rej_uniform MLKEM_NAMESPACE_K(empty_cu_rej_uniform)
-int empty_cu_rej_uniform;
-
-#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/rej_uniform.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/rej_uniform.h
deleted file mode 100644
index 801287259..000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/rej_uniform.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2024 The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef REJ_UNIFORM_H
-#define REJ_UNIFORM_H
-
-#include <stdint.h>
-#include <stdlib.h>
-#include "cbmc.h"
-#include "common.h"
-#include "poly.h"
-
-#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
-/*************************************************
- * Name:        poly_rej_uniform_x4
- *
- * Description: Generate four polynomials using rejection sampling
- *              on (pseudo-)uniformly random bytes sampled from a seed.
- *
- * Arguments:   - poly *vec:           Pointer to an array of 4 polynomials
- *                                     to be sampled.
- *              - uint8_t *seed[4]:    Pointer to array of four pointers
- *                                     pointing to the seed buffers of size
- *                                     MLKEM_SYMBYTES + 2 each.
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
-__contract__(
-  requires(memory_no_alias(vec, sizeof(poly) * 4))
-  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
-  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
-  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(vec, sizeof(poly) * 4))
-  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
-  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
-
-#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform)
-/*************************************************
- * Name:        poly_rej_uniform
- *
- * Description: Generate polynomial using rejection sampling
- *              on (pseudo-)uniformly random bytes sampled from a seed.
- *
- * Arguments:   - poly *vec:           Pointer to polynomial to be sampled.
- *              - uint8_t *seed:       Pointer to seed buffer of size
- *                                     MLKEM_SYMBYTES + 2 each.
- *
- **************************************************/
-MLKEM_NATIVE_INTERNAL_API
-void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
-__contract__(
-  requires(memory_no_alias(entry, sizeof(poly)))
-  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
-  assigns(memory_slice(entry, sizeof(poly)))
-  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
-
-#endif /* REJ_UNIFORM_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/sampling.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/sampling.c
new file mode 100644
index 000000000..98cbdcb74
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/sampling.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "common.h"
+#if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
+
+#include "arith_backend.h"
+#include "debug.h"
+#include "fips202.h"
+#include "fips202x4.h"
+#include "sampling.h"
+#include "symmetric.h"
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define rej_uniform MLKEM_NAMESPACE(rej_uniform)
+#define rej_uniform_scalar MLKEM_NAMESPACE(rej_uniform_scalar)
+#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
+#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
+/* End of static namespacing */
+
+static unsigned int rej_uniform_scalar(int16_t *r, unsigned int target,
+                                       unsigned int offset, const uint8_t *buf,
+                                       unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
+)
+{
+  unsigned int ctr, pos;
+  uint16_t val0, val1;
+
+  debug_assert_bound(r, offset, 0, MLKEM_Q);
+
+  ctr = offset;
+  pos = 0;
+  /* pos + 3 cannot overflow due to the assumption buflen <= 4096 */
+  while (ctr < target && pos + 3 <= buflen)
+  __loop__(
+    invariant(offset <= ctr && ctr <= target && pos <= buflen)
+    invariant(ctr > 0 ==> array_bound(r, 0, ctr, 0, MLKEM_Q)))
+  {
+    val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
+    val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
+    pos += 3;
+
+    if (val0 < MLKEM_Q)
+    {
+      r[ctr++] = val0;
+    }
+    if (ctr < target && val1 < MLKEM_Q)
+    {
+      r[ctr++] = val1;
+    }
+  }
+
+  debug_assert_bound(r, ctr, 0, MLKEM_Q);
+  return ctr;
+}
+
+#if !defined(MLKEM_USE_NATIVE_REJ_UNIFORM)
+/*************************************************
+ * Name:        rej_uniform
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ *              uniform random integers mod q
+ *
+ * Arguments:   - int16_t *r:          pointer to output buffer
+ *              - unsigned int target: requested number of 16-bit integers
+ *                                     (uniform mod q).
+ *                                     Must be <= 4096.
+ *              - unsigned int offset: number of 16-bit integers that have
+ *                                     already been sampled.
+ *                                     Must be <= target.
+ *              - const uint8_t *buf:  pointer to input buffer
+ *                                     (assumed to be uniform random bytes)
+ *              - unsigned int buflen: length of input buffer in bytes
+ *                                     Must be <= 4096.
+ *                                     Must be a multiple of 3.
+ *
+ * Note: Strictly speaking, only a few values of buflen near UINT_MAX need
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
+ * uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
+ *
+ * Returns the new offset of sampled 16-bit integers, at most target,
+ * and at least the initial offset.
+ * If the new offset is strictly less than target, all of the input buffer
+ * is guaranteed to have been consumed. If it is equal to target, no
+ * information is provided on how many bytes of the input buffer were consumed.
+ **************************************************/
+
+/*
+ * NOTE: The signature differs from the Kyber reference implementation
+ * in that it adds the offset and always expects the base of the target
+ * buffer. This avoids shifting the buffer base in the caller, which appears
+ * tricky to reason about.
+ */
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
+  requires(memory_no_alias(r, sizeof(int16_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(offset > 0 ==> array_bound(r, 0, offset, 0, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(return_value > 0 ==> array_bound(r, 0, return_value, 0, MLKEM_Q))
+)
+{
+  return rej_uniform_scalar(r, target, offset, buf, buflen);
+}
+#else  /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+static unsigned int rej_uniform(int16_t *r, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
+{
+  int ret;
+
+  /* Sample from large buffer with full lane as much as possible. */
+  ret = rej_uniform_native(r + offset, target - offset, buf, buflen);
+  if (ret != -1)
+  {
+    unsigned res = offset + (unsigned)ret;
+    debug_assert_bound(r, res, 0, MLKEM_Q);
+    return res;
+  }
+
+  return rej_uniform_scalar(r, target, offset, buf, buflen);
+}
+#endif /* MLKEM_USE_NATIVE_REJ_UNIFORM */
+
+#ifndef MLKEM_GEN_MATRIX_NBLOCKS
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+  ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + XOF_RATE) / XOF_RATE)
+#endif
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+{
+  /* Temporary buffers for XOF output before rejection sampling */
+  uint8_t buf0[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf1[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf2[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  uint8_t buf3[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+
+  /* Tracks the number of coefficients we have already sampled */
+  unsigned int ctr[KECCAK_WAY];
+  xof_x4_ctx statex;
+  unsigned int buflen;
+
+  shake128x4_inc_init(&statex);
+
+  /* seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16 */
+  xof_x4_absorb(&statex, seed[0], seed[1], seed[2], seed[3],
+                MLKEM_SYMBYTES + 2);
+
+  /*
+   * Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
+   * This should generate the matrix entries with high probability.
+   */
+  xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, MLKEM_GEN_MATRIX_NBLOCKS,
+                       &statex);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
+  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
+  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
+  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
+
+  /*
+   * So long as not all matrix entries have been generated, squeeze
+   * one more block at a time until we're done.
+   */
+  buflen = XOF_RATE;
+  while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
+         ctr[3] < MLKEM_N)
+  __loop__(
+    assigns(ctr, statex, memory_slice(vec, sizeof(poly) * 4), object_whole(buf0),
+       object_whole(buf1), object_whole(buf2), object_whole(buf3))
+    invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
+    invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
+    invariant(ctr[0] > 0 ==> array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
+    invariant(ctr[1] > 0 ==> array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
+    invariant(ctr[2] > 0 ==> array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
+    invariant(ctr[3] > 0 ==> array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
+  {
+    xof_x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
+    ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
+    ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
+    ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
+    ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
+  }
+
+  xof_x4_release(&statex);
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+{
+  xof_ctx state;
+  uint8_t buf[MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE];
+  unsigned int ctr, buflen;
+
+  shake128_inc_init(&state);
+
+  xof_absorb(&state, seed, MLKEM_SYMBYTES + 2);
+
+  /* Initially, squeeze and sample a heuristic number of blocks
+   * (MLKEM_GEN_MATRIX_NBLOCKS).
+   * This should generate the matrix entry with high probability. */
+  xof_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &state);
+  buflen = MLKEM_GEN_MATRIX_NBLOCKS * XOF_RATE;
+  ctr = rej_uniform(entry->coeffs, MLKEM_N, 0, buf, buflen);
+
+  /* Squeeze + sample one more block at a time until we're done */
+  buflen = XOF_RATE;
+  while (ctr < MLKEM_N)
+  __loop__(
+    assigns(ctr, state, memory_slice(entry, sizeof(poly)), object_whole(buf))
+    invariant(ctr <= MLKEM_N)
+    invariant(array_bound(entry->coeffs, 0, ctr, 0, MLKEM_Q)))
+  {
+    xof_squeezeblocks(buf, 1, &state);
+    ctr = rej_uniform(entry->coeffs, MLKEM_N, ctr, buf, buflen);
+  }
+
+  xof_release(&state);
+}
+
+/* Static namespacing
+ * This is to facilitate building multiple instances
+ * of mlkem-native (e.g. with varying security levels)
+ * within a single compilation unit. */
+#define load32_littleendian MLKEM_NAMESPACE(load32_littleendian)
+#define load24_littleendian MLKEM_NAMESPACE(load24_littleendian)
+/* End of static namespacing */
+
+/*************************************************
+ * Name:        load32_littleendian
+ *
+ * Description: load 4 bytes into a 32-bit integer
+ *              in little-endian order
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x
+ **************************************************/
+static uint32_t load32_littleendian(const uint8_t x[4])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  r |= (uint32_t)x[3] << 24;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 8; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 8)
+    invariant(array_abs_bound(r->coeffs, 0, 8 * i, 3)))
+  {
+    unsigned j;
+    uint32_t t = load32_littleendian(buf + 4 * i);
+    uint32_t d = t & 0x55555555;
+    d += (t >> 1) & 0x55555555;
+
+    for (j = 0; j < 8; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 8 && j <= 8)
+      invariant(array_abs_bound(r->coeffs, 0, 8 * i + j, 3)))
+    {
+      const int16_t a = (d >> (4 * j + 0)) & 0x3;
+      const int16_t b = (d >> (4 * j + 2)) & 0x3;
+      r->coeffs[8 * i + j] = a - b;
+    }
+  }
+}
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+/*************************************************
+ * Name:        load24_littleendian
+ *
+ * Description: load 3 bytes into a 32-bit integer
+ *              in little-endian order.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
+ **************************************************/
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  r = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  return r;
+}
+
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4])
+{
+  unsigned i;
+  for (i = 0; i < MLKEM_N / 4; i++)
+  __loop__(
+    invariant(i <= MLKEM_N / 4)
+    invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
+  {
+    unsigned j;
+    const uint32_t t = load24_littleendian(buf + 3 * i);
+    uint32_t d = t & 0x00249249;
+    d += (t >> 1) & 0x00249249;
+    d += (t >> 2) & 0x00249249;
+
+    for (j = 0; j < 4; j++)
+    __loop__(
+      invariant(i <= MLKEM_N / 4 && j <= 4)
+      invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
+    {
+      const int16_t a = (d >> (6 * j + 0)) & 0x7;
+      const int16_t b = (d >> (6 * j + 3)) & 0x7;
+      r->coeffs[4 * i + j] = a - b;
+    }
+  }
+}
+#endif /* defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == \
+          3 */
+
+#else /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
+
+#define empty_cu_sampling MLKEM_NAMESPACE_K(empty_cu_sampling)
+int empty_cu_sampling;
+
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/sampling.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/sampling.h
new file mode 100644
index 000000000..cc524e0fc
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/sampling.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef SAMPLING_H
+#define SAMPLING_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include "cbmc.h"
+#include "common.h"
+#include "poly.h"
+
+#define poly_cbd2 MLKEM_NAMESPACE(poly_cbd2)
+/*************************************************
+ * Name:        poly_cbd2
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=2
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd2(poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
+
+#if defined(MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_ETA1 == 3
+#define poly_cbd3 MLKEM_NAMESPACE(poly_cbd3)
+/*************************************************
+ * Name:        poly_cbd3
+ *
+ * Description: Given an array of uniformly random bytes, compute
+ *              polynomial with coefficients distributed according to
+ *              a centered binomial distribution with parameter eta=3.
+ *              This function is only needed for ML-KEM-512
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const uint8_t *buf: pointer to input byte array
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_cbd3(poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
+#endif /* MLKEM_NATIVE_MULTILEVEL_BUILD_WITH_SHARED || MLKEM_ETA1 == 3 */
+
+#define poly_rej_uniform_x4 MLKEM_NAMESPACE(poly_rej_uniform_x4)
+/*************************************************
+ * Name:        poly_rej_uniform_x4
+ *
+ * Description: Generate four polynomials using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
+ *
+ * Arguments:   - poly *vec:           Pointer to an array of 4 polynomials
+ *                                     to be sampled.
+ *              - uint8_t *seed[4]:    Pointer to array of four pointers
+ *                                     pointing to the seed buffers of size
+ *                                     MLKEM_SYMBYTES + 2 each.
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform_x4(poly *vec, uint8_t *seed[4])
+__contract__(
+  requires(memory_no_alias(vec, sizeof(poly) * 4))
+  requires(memory_no_alias(seed, sizeof(uint8_t*) * 4))
+  requires(memory_no_alias(seed[0], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[1], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[2], MLKEM_SYMBYTES + 2))
+  requires(memory_no_alias(seed[3], MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(vec, sizeof(poly) * 4))
+  ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+  ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+
+#define poly_rej_uniform MLKEM_NAMESPACE(poly_rej_uniform)
+/*************************************************
+ * Name:        poly_rej_uniform
+ *
+ * Description: Generate polynomial using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
+ *
+ * Arguments:   - poly *entry:         Pointer to polynomial to be sampled.
+ *              - uint8_t *seed:       Pointer to seed buffer of size
+ *                                     MLKEM_SYMBYTES + 2.
+ *
+ **************************************************/
+MLKEM_NATIVE_INTERNAL_API
+void poly_rej_uniform(poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
+__contract__(
+  requires(memory_no_alias(entry, sizeof(poly)))
+  requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
+  assigns(memory_slice(entry, sizeof(poly)))
+  ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+
+#endif /* SAMPLING_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/zetas.c
index 4ef887c62..987f0dce4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/zetas.c
@@ -10,7 +10,7 @@
 
 #include "common.h"
 #if !defined(MLKEM_NATIVE_MULTILEVEL_BUILD_NO_SHARED)
-#include "ntt.h"
+#include "poly.h"
 
 /*
  * Table of zeta values used in the reference NTT and inverse NTT.
diff --git a/tests/constant_time/kem/passes/ml_kem b/tests/constant_time/kem/passes/ml_kem
index cc4f93e4e..34674562b 100644
--- a/tests/constant_time/kem/passes/ml_kem
+++ b/tests/constant_time/kem/passes/ml_kem
@@ -12,14 +12,14 @@
    fun:PQCP_MLKEM_NATIVE_MLKEM*_dec
 }
 {
-   <insert_a_suppression_name_here>
+   Rejection sampling to produce public "A" matrix
    Memcheck:Cond
    ...
    fun:PQCP_MLKEM_NATIVE_MLKEM*_gen_matrix
    fun:PQCP_MLKEM_NATIVE_MLKEM*_indcpa_*
 }
 {
-   <insert_a_suppression_name_here>
+   Rejection sampling to produce public "A" matrix
    Memcheck:Value8
    ...
    fun:PQCP_MLKEM_NATIVE_MLKEM*_gen_matrix