feat: karatsuba fp6 mul (#20)

argumentcomputer · Aug 21, 2024 · 652ebc1 · 652ebc1
1 parent 8b816fe
commit 652ebc1
Showing 1 changed file with 106 additions and 79 deletions.
diff --git a/src/fp6.rs b/src/fp6.rs
@@ -265,96 +265,123 @@ impl Fp6 {
     }
 
     /// Returns `c = self * b`.
-    ///
-    /// Implements the full-tower interleaving strategy from
-    /// [ePrint 2022-376](https://eprint.iacr.org/2022/367).
     #[inline]
     fn mul_interleaved(&self, b: &Self) -> Self {
-        // The intuition for this algorithm is that we can look at F_p^6 as a direct
-        // extension of F_p^2, and express the overall operations down to the base field
-        // F_p instead of only over F_p^2. This enables us to interleave multiplications
-        // and reductions, ensuring that we don't require double-width intermediate
-        // representations (with around twice as many limbs as F_p elements).
-
-        // We want to express the multiplication c = a x b, where a = (a_0, a_1, a_2) is
-        // an element of F_p^6, and a_i = (a_i,0, a_i,1) is an element of F_p^2. The fully
-        // expanded multiplication is given by (2022-376 §5):
-        //
-        //   c_0,0 = a_0,0 b_0,0 - a_0,1 b_0,1 + a_1,0 b_2,0 - a_1,1 b_2,1 + a_2,0 b_1,0 - a_2,1 b_1,1
-        //                                     - a_1,0 b_2,1 - a_1,1 b_2,0 - a_2,0 b_1,1 - a_2,1 b_1,0.
-        //         = a_0,0 b_0,0 - a_0,1 b_0,1 + a_1,0 (b_2,0 - b_2,1) - a_1,1 (b_2,0 + b_2,1)
-        //                                     + a_2,0 (b_1,0 - b_1,1) - a_2,1 (b_1,0 + b_1,1).
-        //
-        //   c_0,1 = a_0,0 b_0,1 + a_0,1 b_0,0 + a_1,0 b_2,1 + a_1,1 b_2,0 + a_2,0 b_1,1 + a_2,1 b_1,0
-        //                                     + a_1,0 b_2,0 - a_1,1 b_2,1 + a_2,0 b_1,0 - a_2,1 b_1,1.
-        //         = a_0,0 b_0,1 + a_0,1 b_0,0 + a_1,0(b_2,0 + b_2,1) + a_1,1(b_2,0 - b_2,1)
-        //                                     + a_2,0(b_1,0 + b_1,1) + a_2,1(b_1,0 - b_1,1).
-        //
-        //   c_1,0 = a_0,0 b_1,0 - a_0,1 b_1,1 + a_1,0 b_0,0 - a_1,1 b_0,1 + a_2,0 b_2,0 - a_2,1 b_2,1
-        //                                                                 - a_2,0 b_2,1 - a_2,1 b_2,0.
-        //         = a_0,0 b_1,0 - a_0,1 b_1,1 + a_1,0 b_0,0 - a_1,1 b_0,1 + a_2,0(b_2,0 - b_2,1)
-        //                                                                 - a_2,1(b_2,0 + b_2,1).
-        //
-        //   c_1,1 = a_0,0 b_1,1 + a_0,1 b_1,0 + a_1,0 b_0,1 + a_1,1 b_0,0 + a_2,0 b_2,1 + a_2,1 b_2,0
-        //                                                                 + a_2,0 b_2,0 - a_2,1 b_2,1
-        //         = a_0,0 b_1,1 + a_0,1 b_1,0 + a_1,0 b_0,1 + a_1,1 b_0,0 + a_2,0(b_2,0 + b_2,1)
-        //                                                                 + a_2,1(b_2,0 - b_2,1).
-        //
-        //   c_2,0 = a_0,0 b_2,0 - a_0,1 b_2,1 + a_1,0 b_1,0 - a_1,1 b_1,1 + a_2,0 b_0,0 - a_2,1 b_0,1.
-        //   c_2,1 = a_0,0 b_2,1 + a_0,1 b_2,0 + a_1,0 b_1,1 + a_1,1 b_1,0 + a_2,0 b_0,1 + a_2,1 b_0,0.
-        //
-        // Each of these is a "sum of products", which we can compute efficiently.
-
         let a = self;
         cfg_if::cfg_if! {
             if #[cfg(target_os = "zkvm")] {
-                let mut b10_p_b11 = b.c1.c0;
-                b10_p_b11.add_inp(&b.c1.c1);
-                let mut b10_m_b11 = b.c1.c0;
-                b10_m_b11.sub_inp(&b.c1.c1);
-                let mut b20_p_b21 = b.c2.c0;
-                b20_p_b21.add_inp(&b.c2.c1);
-                let mut b20_m_b21 = b.c2.c0;
-                b20_m_b21.sub_inp(&b.c2.c1)
+                // Implements Algorithm 13 from https://eprint.iacr.org/2010/354.pdf
+                let mut t0 = self.c0;
+                t0.mul_inp(&b.c0);
+                let mut t1 = self.c1;
+                t1.mul_inp(&b.c1);
+                let mut t2 = self.c2;
+                t2.mul_inp(&b.c2);
+                let mut c0 = self.c1;
+                c0.add_inp(&self.c2);
+                let mut tmp = b.c1;
+                tmp.add_inp(&b.c2);
+                c0.mul_inp(&tmp);
+                tmp = t2;
+                tmp.add_inp(&t1);
+                c0.sub_inp(&tmp);
+                c0 = c0.mul_by_nonresidue();
+                c0.add_inp(&t0);
+                let mut c1 = self.c0;
+                c1.add_inp(&self.c1);
+                tmp = b.c0;
+                tmp.add_inp(&b.c1);
+                c1.mul_inp(&tmp);
+                tmp = t0;
+                tmp.add_inp(&t1);
+                c1.sub_inp(&tmp);
+                tmp = t2.mul_by_nonresidue();
+                c1.add_inp(&tmp);
+                tmp = self.c0;
+                tmp.add_inp(&self.c2);
+                let mut c2 = b.c0;
+                c2.add_inp(&b.c2);
+                c2.mul_inp(&tmp);
+                tmp = t0;
+                tmp.add_inp(&t2);
+                c2.sub_inp(&tmp);
+                c2.add_inp(&t1);
+                Fp6 { c0, c1, c2 }
             } else {
+                // Implements the full-tower interleaving strategy from
+                // [ePrint 2022-376](https://eprint.iacr.org/2022/367).
+                // The intuition for this algorithm is that we can look at F_p^6 as a direct
+                // extension of F_p^2, and express the overall operations down to the base field
+                // F_p instead of only over F_p^2. This enables us to interleave multiplications
+                // and reductions, ensuring that we don't require double-width intermediate
+                // representations (with around twice as many limbs as F_p elements).
+
+                // We want to express the multiplication c = a x b, where a = (a_0, a_1, a_2) is
+                // an element of F_p^6, and a_i = (a_i,0, a_i,1) is an element of F_p^2. The fully
+                // expanded multiplication is given by (2022-376 §5):
+                //
+                //   c_0,0 = a_0,0 b_0,0 - a_0,1 b_0,1 + a_1,0 b_2,0 - a_1,1 b_2,1 + a_2,0 b_1,0 - a_2,1 b_1,1
+                //                                     - a_1,0 b_2,1 - a_1,1 b_2,0 - a_2,0 b_1,1 - a_2,1 b_1,0.
+                //         = a_0,0 b_0,0 - a_0,1 b_0,1 + a_1,0 (b_2,0 - b_2,1) - a_1,1 (b_2,0 + b_2,1)
+                //                                     + a_2,0 (b_1,0 - b_1,1) - a_2,1 (b_1,0 + b_1,1).
+                //
+                //   c_0,1 = a_0,0 b_0,1 + a_0,1 b_0,0 + a_1,0 b_2,1 + a_1,1 b_2,0 + a_2,0 b_1,1 + a_2,1 b_1,0
+                //                                     + a_1,0 b_2,0 - a_1,1 b_2,1 + a_2,0 b_1,0 - a_2,1 b_1,1.
+                //         = a_0,0 b_0,1 + a_0,1 b_0,0 + a_1,0(b_2,0 + b_2,1) + a_1,1(b_2,0 - b_2,1)
+                //                                     + a_2,0(b_1,0 + b_1,1) + a_2,1(b_1,0 - b_1,1).
+                //
+                //   c_1,0 = a_0,0 b_1,0 - a_0,1 b_1,1 + a_1,0 b_0,0 - a_1,1 b_0,1 + a_2,0 b_2,0 - a_2,1 b_2,1
+                //                                                                 - a_2,0 b_2,1 - a_2,1 b_2,0.
+                //         = a_0,0 b_1,0 - a_0,1 b_1,1 + a_1,0 b_0,0 - a_1,1 b_0,1 + a_2,0(b_2,0 - b_2,1)
+                //                                                                 - a_2,1(b_2,0 + b_2,1).
+                //
+                //   c_1,1 = a_0,0 b_1,1 + a_0,1 b_1,0 + a_1,0 b_0,1 + a_1,1 b_0,0 + a_2,0 b_2,1 + a_2,1 b_2,0
+                //                                                                 + a_2,0 b_2,0 - a_2,1 b_2,1
+                //         = a_0,0 b_1,1 + a_0,1 b_1,0 + a_1,0 b_0,1 + a_1,1 b_0,0 + a_2,0(b_2,0 + b_2,1)
+                //                                                                 + a_2,1(b_2,0 - b_2,1).
+                //
+                //   c_2,0 = a_0,0 b_2,0 - a_0,1 b_2,1 + a_1,0 b_1,0 - a_1,1 b_1,1 + a_2,0 b_0,0 - a_2,1 b_0,1.
+                //   c_2,1 = a_0,0 b_2,1 + a_0,1 b_2,0 + a_1,0 b_1,1 + a_1,1 b_1,0 + a_2,0 b_0,1 + a_2,1 b_0,0.
+                //
+                // Each of these is a "sum of products", which we can compute efficiently.
                 let b10_p_b11 = b.c1.c0 + b.c1.c1;
                 let b10_m_b11 = b.c1.c0 - b.c1.c1;
                 let b20_p_b21 = b.c2.c0 + b.c2.c1;
                 let b20_m_b21 = b.c2.c0 - b.c2.c1;
-            }
-        }
 
-        Fp6 {
-            c0: Fp2 {
-                c0: Fp::sum_of_products(
-                    [a.c0.c0, -a.c0.c1, a.c1.c0, -a.c1.c1, a.c2.c0, -a.c2.c1],
-                    [b.c0.c0, b.c0.c1, b20_m_b21, b20_p_b21, b10_m_b11, b10_p_b11],
-                ),
-                c1: Fp::sum_of_products(
-                    [a.c0.c0, a.c0.c1, a.c1.c0, a.c1.c1, a.c2.c0, a.c2.c1],
-                    [b.c0.c1, b.c0.c0, b20_p_b21, b20_m_b21, b10_p_b11, b10_m_b11],
-                ),
-            },
-            c1: Fp2 {
-                c0: Fp::sum_of_products(
-                    [a.c0.c0, -a.c0.c1, a.c1.c0, -a.c1.c1, a.c2.c0, -a.c2.c1],
-                    [b.c1.c0, b.c1.c1, b.c0.c0, b.c0.c1, b20_m_b21, b20_p_b21],
-                ),
-                c1: Fp::sum_of_products(
-                    [a.c0.c0, a.c0.c1, a.c1.c0, a.c1.c1, a.c2.c0, a.c2.c1],
-                    [b.c1.c1, b.c1.c0, b.c0.c1, b.c0.c0, b20_p_b21, b20_m_b21],
-                ),
-            },
-            c2: Fp2 {
-                c0: Fp::sum_of_products(
-                    [a.c0.c0, -a.c0.c1, a.c1.c0, -a.c1.c1, a.c2.c0, -a.c2.c1],
-                    [b.c2.c0, b.c2.c1, b.c1.c0, b.c1.c1, b.c0.c0, b.c0.c1],
-                ),
-                c1: Fp::sum_of_products(
-                    [a.c0.c0, a.c0.c1, a.c1.c0, a.c1.c1, a.c2.c0, a.c2.c1],
-                    [b.c2.c1, b.c2.c0, b.c1.c1, b.c1.c0, b.c0.c1, b.c0.c0],
-                ),
-            },
+                Fp6 {
+                    c0: Fp2 {
+                        c0: Fp::sum_of_products(
+                            [a.c0.c0, -a.c0.c1, a.c1.c0, -a.c1.c1, a.c2.c0, -a.c2.c1],
+                            [b.c0.c0, b.c0.c1, b20_m_b21, b20_p_b21, b10_m_b11, b10_p_b11],
+                        ),
+                        c1: Fp::sum_of_products(
+                            [a.c0.c0, a.c0.c1, a.c1.c0, a.c1.c1, a.c2.c0, a.c2.c1],
+                            [b.c0.c1, b.c0.c0, b20_p_b21, b20_m_b21, b10_p_b11, b10_m_b11],
+                        ),
+                    },
+                    c1: Fp2 {
+                        c0: Fp::sum_of_products(
+                            [a.c0.c0, -a.c0.c1, a.c1.c0, -a.c1.c1, a.c2.c0, -a.c2.c1],
+                            [b.c1.c0, b.c1.c1, b.c0.c0, b.c0.c1, b20_m_b21, b20_p_b21],
+                        ),
+                        c1: Fp::sum_of_products(
+                            [a.c0.c0, a.c0.c1, a.c1.c0, a.c1.c1, a.c2.c0, a.c2.c1],
+                            [b.c1.c1, b.c1.c0, b.c0.c1, b.c0.c0, b20_p_b21, b20_m_b21],
+                        ),
+                    },
+                    c2: Fp2 {
+                        c0: Fp::sum_of_products(
+                            [a.c0.c0, -a.c0.c1, a.c1.c0, -a.c1.c1, a.c2.c0, -a.c2.c1],
+                            [b.c2.c0, b.c2.c1, b.c1.c0, b.c1.c1, b.c0.c0, b.c0.c1],
+                        ),
+                        c1: Fp::sum_of_products(
+                            [a.c0.c0, a.c0.c1, a.c1.c0, a.c1.c1, a.c2.c0, a.c2.c1],
+                            [b.c2.c1, b.c2.c0, b.c1.c1, b.c1.c0, b.c0.c1, b.c0.c0],
+                        ),
+                    },
+                }
+            }
         }
     }