0xPolygonMiden · bobbinth · Oct 26, 2023 · Oct 24, 2023 · Oct 26, 2023
diff --git a/benches/README.md b/benches/README.md
@@ -6,6 +6,7 @@ In the Miden VM, we make use of different hash functions. Some of these are "tra
 * **Poseidon** as specified [here](https://eprint.iacr.org/2019/458.pdf) and implemented [here](https://github.com/mir-protocol/plonky2/blob/806b88d7d6e69a30dc0b4775f7ba275c45e8b63b/plonky2/src/hash/poseidon_goldilocks.rs) (but in pure Rust, without vectorized instructions).
 * **Rescue Prime (RP)** as specified [here](https://eprint.iacr.org/2020/1143) and implemented [here](https://github.com/novifinancial/winterfell/blob/46dce1adf0/crypto/src/hash/rescue/rp64_256/mod.rs).
 * **Rescue Prime Optimized (RPO)** as specified [here](https://eprint.iacr.org/2022/1577) and implemented in this crate.
+* **Rescue Prime Extended (RPX)**, a variant of the [xHash](https://eprint.iacr.org/2023/1045) hash function, as implemented in this crate.
 
 ## Comparison and Instructions
 
@@ -15,25 +16,25 @@ The second scenario is that of sequential hashing where we take a sequence of le
 
 #### Scenario 1: 2-to-1 hashing `h(a,b)`
 
-| Function            | BLAKE3 | SHA3    | Poseidon  | Rp64_256  | RPO_256 |
-| ------------------- | ------ | --------| --------- | --------- | ------- |
-| Apple M1 Pro        | 80 ns  | 245 ns  |  1.5 us   |  9.1 us   | 5.4 us  |
-| Apple M2            | 76 ns  | 233 ns  |  1.3 us   |  7.9 us   | 5.0 us  |
-| Amazon Graviton 3   | 108 ns |         |           |           | 5.3 us  |
-| AMD Ryzen 9 5950X   | 64 ns  | 273 ns  |  1.2 us   |  9.1 us   | 5.5 us  |
-| Intel Core i5-8279U | 80 ns  |         |           |           | 8.7 us  |
-| Intel Xeon 8375C    | 67 ns  |         |           |           | 8.2 us  |
+| Function            | BLAKE3 | SHA3    | Poseidon  | Rp64_256  | RPO_256 | RPX_256 |
+| ------------------- | ------ | ------- | --------- | --------- | ------- | ------- |
+| Apple M1 Pro        | 76 ns  | 245 ns  |  1.5 µs   |  9.1 µs   | 5.2 µs  | 2.7 µs  |
+| Apple M2 Max        | 71 ns  | 233 ns  |  1.3 µs   |  7.9 µs   | 4.6 µs  | 2.4 µs  |
+| Amazon Graviton 3   | 108 ns |         |           |           | 5.3 µs  | 3.1 µs  |
+| AMD Ryzen 9 5950X   | 64 ns  | 273 ns  |  1.2 µs   |  9.1 µs   | 5.5 µs  |         |
+| Intel Core i5-8279U | 68 ns  | 536 ns  |  2.0 µs   |  13.6 µs  | 8.5 µs  | 4.4 µs  |
+| Intel Xeon 8375C    | 67 ns  |         |           |           | 8.2 µs  |         |
 
 #### Scenario 2: Sequential hashing of 100 elements `h([a_0,...,a_99])`
 
-| Function            | BLAKE3 | SHA3    | Poseidon  | Rp64_256  | RPO_256 |
-| ------------------- | -------| ------- | --------- | --------- | ------- |
-| Apple M1 Pro        | 1.0 us | 1.5 us  |  19.4 us  |   118 us  | 70 us   |
-| Apple M2            | 1.0 us | 1.5 us  |  17.4 us  |   103 us  | 65 us   |
-| Amazon Graviton 3   | 1.4 us |         |           |           | 69 us   |
-| AMD Ryzen 9 5950X   | 0.8 us | 1.7 us  |  15.7 us  |   120 us  | 72 us   |
-| Intel Core i5-8279U | 1.0 us |         |           |           | 116 us  |
-| Intel Xeon 8375C    | 0.8 ns |         |           |           | 110 us  |
+| Function            | BLAKE3 | SHA3    | Poseidon  | Rp64_256  | RPO_256 | RPX_256 |
+| ------------------- | ------ | ------- | --------- | --------- | ------- | ------- |
+| Apple M1 Pro        | 1.0 µs | 1.5 µs  |  19.4 µs  |   118 µs  | 69 µs   | 35 µs   |
+| Apple M2 Max        | 0.9 µs | 1.5 µs  |  17.4 µs  |   103 µs  | 60 µs   | 31 µs   |
+| Amazon Graviton 3   | 1.4 µs |         |           |           | 69 µs   | 41 µs   |
+| AMD Ryzen 9 5950X   | 0.8 µs | 1.7 µs  |  15.7 µs  |   120 µs  | 72 µs   |         |
+| Intel Core i5-8279U | 0.9 µs |         |           |           | 107 µs  | 56 µs   |
+| Intel Xeon 8375C    | 0.8 µs |         |           |           | 110 µs  |         |
 
 Notes:
 - On Graviton 3, RPO256 is run with SVE acceleration enabled.

diff --git a/benches/hash.rs b/benches/hash.rs
@@ -3,6 +3,7 @@ use miden_crypto::{
     hash::{
         blake::Blake3_256,
         rpo::{Rpo256, RpoDigest},
+        rpx::{Rpx256, RpxDigest},
     },
     Felt,
 };
@@ -57,6 +58,54 @@ fn rpo256_sequential(c: &mut Criterion) {
     });
 }
 
+/// Benchmarks RPX256 2-to-1 hashing (`Rpx256::merge`) in two variants: one over a fixed pair of
+/// digests ("cached") and one over a freshly generated pair of digests per batch ("random").
+fn rpx256_2to1(c: &mut Criterion) {
+    // Fixed input pair, built once and reused by every iteration of the "cached" benchmark.
+    let digests: [RpxDigest; 2] = [Rpx256::hash(&[1_u8]), Rpx256::hash(&[2_u8])];
+    c.bench_function("RPX256 2-to-1 hashing (cached)", |b| {
+        b.iter(|| Rpx256::merge(black_box(&digests)))
+    });
+
+    // Produces one digest by hashing the little-endian bytes of a random `u64`.
+    let random_digest = || Rpx256::hash(&rand_value::<u64>().to_le_bytes());
+    c.bench_function("RPX256 2-to-1 hashing (random)", |b| {
+        b.iter_batched(
+            // Setup: build a new input pair for the routine below.
+            || [random_digest(), random_digest()],
+            |pair| Rpx256::merge(&pair),
+            BatchSize::SmallInput,
+        )
+    });
+}
+
+/// Benchmarks RPX256 sequential hashing (`Rpx256::hash_elements`) of 100 field elements in two
+/// variants: one over a fixed input built from the values `0..100` ("cached") and one over 100
+/// freshly generated random elements per batch ("random").
+fn rpx256_sequential(c: &mut Criterion) {
+    // A range is already an iterator, so it is mapped directly without `.into_iter()`.
+    let v: [Felt; 100] = (0..100)
+        .map(Felt::new)
+        .collect::<Vec<Felt>>()
+        .try_into()
+        .expect("should not fail");
+    c.bench_function("RPX256 sequential hashing (cached)", |bench| {
+        bench.iter(|| Rpx256::hash_elements(black_box(&v)))
+    });
+
+    c.bench_function("RPX256 sequential hashing (random)", |bench| {
+        bench.iter_batched(
+            // Setup: build a new array of 100 random elements; the closure's return type drives
+            // the `try_into` conversion, so no intermediate binding is needed.
+            || -> [Felt; 100] {
+                (0..100)
+                    .map(|_| Felt::new(rand_value()))
+                    .collect::<Vec<Felt>>()
+                    .try_into()
+                    .expect("should not fail")
+            },
+            |state| Rpx256::hash_elements(&state),
+            BatchSize::SmallInput,
+        )
+    });
+}
+
 fn blake3_2to1(c: &mut Criterion) {
     let v: [<Blake3_256 as Hasher>::Digest; 2] =
         [Blake3_256::hash(&[1_u8]), Blake3_256::hash(&[2_u8])];
@@ -106,5 +155,13 @@ fn blake3_sequential(c: &mut Criterion) {
     });
 }
 
-criterion_group!(hash_group, rpo256_2to1, rpo256_sequential, blake3_2to1, blake3_sequential);
+// Groups the RPX, RPO and BLAKE3 benchmark functions under `hash_group`, the group passed to
+// `criterion_main!` below.
+criterion_group!(
+    hash_group,
+    rpx256_2to1,
+    rpx256_sequential,
+    rpo256_2to1,
+    rpo256_sequential,
+    blake3_2to1,
+    blake3_sequential
+);
 criterion_main!(hash_group);
diff --git a/src/hash/mod.rs b/src/hash/mod.rs
@@ -1,9 +1,17 @@
 //! Cryptographic hash functions used by the Miden VM and the Miden rollup.
 
-use super::{Felt, FieldElement, StarkField, ONE, ZERO};
+use super::{CubeExtension, Felt, FieldElement, StarkField, ONE, ZERO};
 
 pub mod blake;
-pub mod rpo;
+
+// Private module defining the items re-exported below; it is reachable from outside this module
+// only through the `rpo` and `rpx` modules.
+mod rescue;
+/// Re-exports `Rpo256` and `RpoDigest` from the private `rescue` module.
+pub mod rpo {
+    pub use super::rescue::{Rpo256, RpoDigest};
+}
+
+/// Re-exports `Rpx256` and `RpxDigest` from the private `rescue` module.
+pub mod rpx {
+    pub use super::rescue::{Rpx256, RpxDigest};
+}
 
 // RE-EXPORTS
 // ================================================================================================

diff --git a/src/hash/rpo/mds_freq.rs → src/hash/rescue/mds/freq.rs b/src/hash/rpo/mds_freq.rs → src/hash/rescue/mds/freq.rs
@@ -11,7 +11,8 @@
 /// divisions by 2 and repeated modular reductions. This is because of our explicit choice of
 /// an MDS matrix that has small powers of 2 entries in frequency domain.
 /// The following implementation has benefited greatly from the discussions and insights of
-/// Hamish Ivey-Law and Jacqueline Nabaglo of Polygon Zero.
+/// Hamish Ivey-Law and Jacqueline Nabaglo of Polygon Zero and is based on Nabaglo's Plonky2
+/// implementation.
 
 // Rescue MDS matrix in frequency domain.
 // More precisely, this is the output of the three 4-point (real) FFTs of the first column of
@@ -26,7 +27,7 @@ const MDS_FREQ_BLOCK_THREE: [i64; 3] = [-8, 1, 1];
 
 // We use split 3 x 4 FFT transform in order to transform our vectors into the frequency domain.
 #[inline(always)]
-pub(crate) const fn mds_multiply_freq(state: [u64; 12]) -> [u64; 12] {
+pub const fn mds_multiply_freq(state: [u64; 12]) -> [u64; 12] {
     let [s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] = state;
 
     let (u0, u1, u2) = fft4_real([s0, s3, s6, s9]);
@@ -156,7 +157,7 @@ const fn block3(x: [i64; 3], y: [i64; 3]) -> [i64; 3] {
 
 #[cfg(test)]
 mod tests {
-    use super::super::{Felt, Rpo256, MDS, ZERO};
+    use super::super::{apply_mds, Felt, MDS, ZERO};
     use proptest::prelude::*;
 
     const STATE_WIDTH: usize = 12;
@@ -185,7 +186,7 @@ mod tests {
             v2 = v1;
 
             apply_mds_naive(&mut v1);
-            Rpo256::apply_mds(&mut v2);
+            apply_mds(&mut v2);
 
             prop_assert_eq!(v1, v2);
         }

diff --git a/src/hash/rescue/mds/mod.rs b/src/hash/rescue/mds/mod.rs
@@ -0,0 +1,214 @@
+use super::{Felt, STATE_WIDTH, ZERO};
+
+mod freq;
+pub use freq::mds_multiply_freq;
+
+// MDS MULTIPLICATION
+// ================================================================================================
+
+/// Multiplies `state` by the MDS matrix in place.
+///
+/// The raw `u64` value of every element (as returned by `inner()`) is split into its upper and
+/// lower 32-bit halves, each half-vector is multiplied with `mds_multiply_freq`, and the two
+/// products are then recombined and reduced to a single `u64` per element, which is written back
+/// into `state`.
+#[inline(always)]
+pub fn apply_mds(state: &mut [Felt; STATE_WIDTH]) {
+    let mut result = [ZERO; STATE_WIDTH];
+
+    // Using the linearity of the operations we can split the state into a low||high decomposition
+    // and operate on each half with no overflow, and then combine/reduce the result to a field
+    // element. The absence of overflow is guaranteed by the fact that the entries of the MDS
+    // matrix are small powers of two in the frequency domain.
+    let mut state_l = [0u64; STATE_WIDTH];
+    let mut state_h = [0u64; STATE_WIDTH];
+
+    for r in 0..STATE_WIDTH {
+        // NOTE(review): `inner()` presumably exposes the internal (Montgomery-form)
+        // representation, mirroring the `from_mont` call below - confirm against `Felt`.
+        let s = state[r].inner();
+        // Upper 32 bits.
+        state_h[r] = s >> 32;
+        // Lower 32 bits (truncating cast, then zero-extension).
+        state_l[r] = (s as u32) as u64;
+    }
+
+    let state_h = mds_multiply_freq(state_h);
+    let state_l = mds_multiply_freq(state_l);
+
+    for r in 0..STATE_WIDTH {
+        // Recombine the halves in 128 bits: low product + 2^32 * high product.
+        let s = state_l[r] as u128 + ((state_h[r] as u128) << 32);
+        let s_hi = (s >> 64) as u64;
+        let s_lo = s as u64;
+        // z = s_hi * (2^32 - 1): folds the bits above 2^64 back into the low 64 bits.
+        // NOTE(review): this matches reduction modulo 2^64 - 2^32 + 1 (where 2^64 = 2^32 - 1);
+        // confirm against the modulus of `Felt`. The shift also relies on `s_hi` fitting in 32
+        // bits, which the no-overflow argument above is expected to guarantee.
+        let z = (s_hi << 32) - s_hi;
+        let (res, over) = s_lo.overflowing_add(z);
+
+        // If the addition wrapped around 2^64, add 2^32 - 1 (`0u32 - 1` with wrapping) to
+        // compensate; otherwise add 0.
+        result[r] = Felt::from_mont(res.wrapping_add(0u32.wrapping_sub(over as u32) as u64));
+    }
+    *state = result;
+}
+
+// MDS MATRIX
+// ================================================================================================
+
+/// RPO MDS matrix.
+///
+/// The matrix is circulant: every row is the previous row rotated one position to the right, so
+/// the whole matrix is determined by its first row.
+pub const MDS: [[Felt; STATE_WIDTH]; STATE_WIDTH] = [
+    [
+        Felt::new(7),
+        Felt::new(23),
+        Felt::new(8),
+        Felt::new(26),
+        Felt::new(13),
+        Felt::new(10),
+        Felt::new(9),
+        Felt::new(7),
+        Felt::new(6),
+        Felt::new(22),
+        Felt::new(21),
+        Felt::new(8),
+    ],
+    [
+        Felt::new(8),
+        Felt::new(7),
+        Felt::new(23),
+        Felt::new(8),
+        Felt::new(26),
+        Felt::new(13),
+        Felt::new(10),
+        Felt::new(9),
+        Felt::new(7),
+        Felt::new(6),
+        Felt::new(22),
+        Felt::new(21),
+    ],
+    [
+        Felt::new(21),
+        Felt::new(8),
+        Felt::new(7),
+        Felt::new(23),
+        Felt::new(8),
+        Felt::new(26),
+        Felt::new(13),
+        Felt::new(10),
+        Felt::new(9),
+        Felt::new(7),
+        Felt::new(6),
+        Felt::new(22),
+    ],
+    [
+        Felt::new(22),
+        Felt::new(21),
+        Felt::new(8),
+        Felt::new(7),
+        Felt::new(23),
+        Felt::new(8),
+        Felt::new(26),
+        Felt::new(13),
+        Felt::new(10),
+        Felt::new(9),
+        Felt::new(7),
+        Felt::new(6),
+    ],
+    [
+        Felt::new(6),
+        Felt::new(22),
+        Felt::new(21),
+        Felt::new(8),
+        Felt::new(7),
+        Felt::new(23),
+        Felt::new(8),
+        Felt::new(26),
+        Felt::new(13),
+        Felt::new(10),
+        Felt::new(9),
+        Felt::new(7),
+    ],
+    [
+        Felt::new(7),
+        Felt::new(6),
+        Felt::new(22),
+        Felt::new(21),
+        Felt::new(8),
+        Felt::new(7),
+        Felt::new(23),
+        Felt::new(8),
+        Felt::new(26),
+        Felt::new(13),
+        Felt::new(10),
+        Felt::new(9),
+    ],
+    [
+        Felt::new(9),
+        Felt::new(7),
+        Felt::new(6),
+        Felt::new(22),
+        Felt::new(21),
+        Felt::new(8),
+        Felt::new(7),
+        Felt::new(23),
+        Felt::new(8),
+        Felt::new(26),
+        Felt::new(13),
+        Felt::new(10),
+    ],
+    [
+        Felt::new(10),
+        Felt::new(9),
+        Felt::new(7),
+        Felt::new(6),
+        Felt::new(22),
+        Felt::new(21),
+        Felt::new(8),
+        Felt::new(7),
+        Felt::new(23),
+        Felt::new(8),
+        Felt::new(26),
+        Felt::new(13),
+    ],
+    [
+        Felt::new(13),
+        Felt::new(10),
+        Felt::new(9),
+        Felt::new(7),
+        Felt::new(6),
+        Felt::new(22),
+        Felt::new(21),
+        Felt::new(8),
+        Felt::new(7),
+        Felt::new(23),
+        Felt::new(8),
+        Felt::new(26),
+    ],
+    [
+        Felt::new(26),
+        Felt::new(13),
+        Felt::new(10),
+        Felt::new(9),
+        Felt::new(7),
+        Felt::new(6),
+        Felt::new(22),
+        Felt::new(21),
+        Felt::new(8),
+        Felt::new(7),
+        Felt::new(23),
+        Felt::new(8),
+    ],
+    [
+        Felt::new(8),
+        Felt::new(26),
+        Felt::new(13),
+        Felt::new(10),
+        Felt::new(9),
+        Felt::new(7),
+        Felt::new(6),
+        Felt::new(22),
+        Felt::new(21),
+        Felt::new(8),
+        Felt::new(7),
+        Felt::new(23),
+    ],
+    [
+        Felt::new(23),
+        Felt::new(8),
+        Felt::new(26),
+        Felt::new(13),
+        Felt::new(10),
+        Felt::new(9),
+        Felt::new(7),
+        Felt::new(6),
+        Felt::new(22),
+        Felt::new(21),
+        Felt::new(8),
+        Felt::new(7),
+    ],
+];