Skip to content

Commit 3222167

Browse files
authored
Merge pull request #2969 from o1-labs/volhovm/profiling-kimchi-thread-utilisation
Performance: parallelising kimchi prover & verifier
2 parents 59ad3a7 + 0cb4118 commit 3222167

File tree

14 files changed

+594
-132
lines changed

14 files changed

+594
-132
lines changed

kimchi/benches/proof_criterion.rs

+28-9
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
1+
#![allow(clippy::unit_arg)]
12
use criterion::{black_box, criterion_group, criterion_main, Criterion, SamplingMode};
23
use kimchi::bench::BenchmarkCtx;
34

45
pub fn bench_proof_creation(c: &mut Criterion) {
56
let mut group = c.benchmark_group("Proof creation");
6-
group.sample_size(10).sampling_mode(SamplingMode::Flat); // for slow benchmarks
7+
group.sampling_mode(SamplingMode::Flat); // for slow benchmarks
8+
group.measurement_time(std::time::Duration::from_secs(90));
79

8-
for size in [10, 14] {
10+
for size in [10, 15, 16] {
911
let ctx = BenchmarkCtx::new(size);
1012

1113
group.bench_function(
@@ -21,19 +23,36 @@ pub fn bench_proof_creation(c: &mut Criterion) {
2123

2224
pub fn bench_proof_verification(c: &mut Criterion) {
2325
let mut group = c.benchmark_group("Proof verification");
24-
group.sample_size(100).sampling_mode(SamplingMode::Auto);
2526

26-
for size in [10, 14] {
27-
let ctx = BenchmarkCtx::new(size);
28-
let proof_and_public = ctx.create_proof();
27+
// Unfortunately, we have to use relatively big sample sizes. With this
28+
// the noise should be <0.5%
29+
group.sampling_mode(SamplingMode::Linear);
30+
group.measurement_time(std::time::Duration::from_secs(300));
31+
32+
for n_gates_log in [10, 15, 16] {
33+
// averaging over several proofs and contexts, since using
34+
// just one seems to introduce extra variance.
35+
let inputs: Vec<_> = (0..20)
36+
.map(|_| {
37+
let ctx = BenchmarkCtx::new(n_gates_log);
38+
let proof = ctx.create_proof();
39+
(ctx, proof)
40+
})
41+
.collect();
2942

3043
group.bench_function(
3144
format!(
3245
"proof verification (SRS size 2^{{{}}}, {} gates)",
33-
ctx.srs_size(),
34-
ctx.num_gates
46+
inputs[0].0.srs_size(),
47+
1 << n_gates_log
3548
),
36-
|b| b.iter(|| ctx.batch_verification(black_box(&vec![proof_and_public.clone()]))),
49+
|b| {
50+
b.iter_batched(
51+
|| &inputs[rand::random::<usize>() % inputs.len()],
52+
|(ctx, proof)| black_box(ctx.batch_verification(std::slice::from_ref(proof))),
53+
criterion::BatchSize::LargeInput,
54+
)
55+
},
3756
);
3857
}
3958
}

kimchi/src/bench.rs

+7-8
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ use mina_poseidon::{
77
sponge::{DefaultFqSponge, DefaultFrSponge},
88
};
99
use o1_utils::math;
10-
use poly_commitment::{commitment::CommitmentCurve, ipa::OpeningProof, SRS as _};
10+
use poly_commitment::{commitment::CommitmentCurve, ipa::OpeningProof, SRS};
1111

1212
use crate::{
1313
circuits::{
@@ -18,7 +18,6 @@ use crate::{
1818
proof::ProverProof,
1919
prover_index::{testing::new_index_for_test, ProverIndex},
2020
verifier::{batch_verify, Context},
21-
verifier_index::VerifierIndex,
2221
};
2322

2423
type SpongeParams = PlonkSpongeConstantsKimchi;
@@ -29,7 +28,6 @@ pub struct BenchmarkCtx {
2928
pub num_gates: usize,
3029
group_map: BWParameters<VestaParameters>,
3130
index: ProverIndex<Vesta, OpeningProof<Vesta>>,
32-
verifier_index: VerifierIndex<Vesta, OpeningProof<Vesta>>,
3331
}
3432

3533
impl BenchmarkCtx {
@@ -61,19 +59,20 @@ impl BenchmarkCtx {
6159
let group_map = <Vesta as CommitmentCurve>::Map::setup();
6260

6361
// create the index
64-
let index = new_index_for_test(gates, 0);
62+
let mut index = new_index_for_test(gates, 0);
6563

6664
assert_eq!(index.cs.domain.d1.log_size_of_group, srs_size_log2, "the test wanted to use an SRS of size {srs_size_log2} but the domain size ended up being {}", index.cs.domain.d1.log_size_of_group);
6765

6866
// create the verifier index
69-
let verifier_index = index.verifier_index();
67+
index.compute_verifier_index_digest::<BaseSponge>();
68+
69+
// just in case check that lagrange bases are generated
70+
index.srs.get_lagrange_basis(index.cs.domain.d1);
7071

71-
//
7272
BenchmarkCtx {
7373
num_gates,
7474
group_map,
7575
index,
76-
verifier_index,
7776
}
7877
}
7978

@@ -104,7 +103,7 @@ impl BenchmarkCtx {
104103
let batch: Vec<_> = batch
105104
.iter()
106105
.map(|(proof, public)| Context {
107-
verifier_index: &self.verifier_index,
106+
verifier_index: self.index.verifier_index.as_ref().unwrap(),
108107
proof,
109108
public_input: public,
110109
})

kimchi/src/circuits/constraints.rs

+45-16
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ use ark_poly::{
2626
use o1_utils::ExtendedEvaluations;
2727
use once_cell::sync::OnceCell;
2828
use poly_commitment::OpenProof;
29+
use rayon::prelude::*;
2930
use serde::{de::DeserializeOwned, Deserialize, Serialize};
3031
use serde_with::serde_as;
3132
use std::{array, default::Default, sync::Arc};
@@ -373,25 +374,53 @@ impl<F: PrimeField, G: KimchiCurve<ScalarField = F>, OpeningProof: OpenProof<G>>
373374
impl<F: PrimeField> ConstraintSystem<F> {
374375
/// evaluate witness polynomials over domains
375376
pub fn evaluate(&self, w: &[DP<F>; COLUMNS], z: &DP<F>) -> WitnessOverDomains<F> {
376-
// compute shifted witness polynomials
377-
let w8: [E<F, D<F>>; COLUMNS] =
378-
array::from_fn(|i| w[i].evaluate_over_domain_by_ref(self.domain.d8));
379-
let z8 = z.evaluate_over_domain_by_ref(self.domain.d8);
380-
381-
let w4: [E<F, D<F>>; COLUMNS] = array::from_fn(|i| {
382-
E::<F, D<F>>::from_vec_and_domain(
383-
(0..self.domain.d4.size)
384-
.map(|j| w8[i].evals[2 * j as usize])
385-
.collect(),
386-
self.domain.d4,
387-
)
388-
});
377+
// compute shifted witness polynomials and z8, all in parallel
378+
let (w8, z8): ([E<F, D<F>>; COLUMNS], _) = {
379+
let mut res = w
380+
.par_iter()
381+
.chain(rayon::iter::once(z))
382+
.map(|elem| elem.evaluate_over_domain_by_ref(self.domain.d8))
383+
.collect::<Vec<_>>();
384+
let z8 = res[COLUMNS].clone();
385+
res.truncate(COLUMNS);
386+
(res.try_into().unwrap(), z8)
387+
};
388+
389+
let w4: [E<F, D<F>>; COLUMNS] = (0..COLUMNS)
390+
.into_par_iter()
391+
.map(|i| {
392+
E::<F, D<F>>::from_vec_and_domain(
393+
(0..self.domain.d4.size)
394+
.map(|j| w8[i].evals[2 * j as usize])
395+
.collect(),
396+
self.domain.d4,
397+
)
398+
})
399+
.collect::<Vec<_>>()
400+
.try_into()
401+
.unwrap();
402+
389403
let z4 = DP::<F>::zero().evaluate_over_domain_by_ref(D::<F>::new(1).unwrap());
404+
let z8_shift8 = z8.shift(8);
405+
406+
let d4_next_w: [_; COLUMNS] = w4
407+
.par_iter()
408+
.map(|w4_i| w4_i.shift(4))
409+
.collect::<Vec<_>>()
410+
.try_into()
411+
.unwrap();
412+
413+
let d8_next_w: [_; COLUMNS] = w8
414+
.par_iter()
415+
.map(|w8_i| w8_i.shift(8))
416+
.collect::<Vec<_>>()
417+
.try_into()
418+
.unwrap();
390419

391420
WitnessOverDomains {
392421
d4: WitnessShifts {
393422
next: WitnessEvals {
394-
w: array::from_fn(|i| w4[i].shift(4)),
423+
w: d4_next_w,
395424
// TODO(mimoo): change z to an Option? Or maybe not, we might actually need this dummy evaluation in the aggregated evaluation proof
396425
z: z4.clone(), // dummy evaluation
397426
},
@@ -402,8 +431,8 @@ impl<F: PrimeField> ConstraintSystem<F> {
402431
},
403432
d8: WitnessShifts {
404433
next: WitnessEvals {
405-
w: array::from_fn(|i| w8[i].shift(8)),
406-
z: z8.shift(8),
434+
w: d8_next_w,
435+
z: z8_shift8,
407436
},
408437
this: WitnessEvals { w: w8, z: z8 },
409438
},

kimchi/src/circuits/polynomials/permutation.rs

+77-32
Original file line numberDiff line numberDiff line change
@@ -257,32 +257,45 @@ impl<F: PrimeField, G: KimchiCurve<ScalarField = F>, OpeningProof: OpenProof<G>>
257257
// (w[1](x) + gamma + x * beta * shift[1]) * ...
258258
// (w[6](x) + gamma + x * beta * shift[6])
259259
// in evaluation form in d8
260-
let mut shifts = lagrange.d8.this.z.clone();
261-
for (witness, shift) in lagrange.d8.this.w.iter().zip(self.cs.shift.iter()) {
262-
let term =
263-
&(witness + gamma) + &self.cs.precomputations().poly_x_d1.scale(beta * shift);
264-
shifts = &shifts * &term;
265-
}
260+
let shifts: Evaluations<F, D<F>> = &lagrange
261+
.d8
262+
.this
263+
.w
264+
.par_iter()
265+
.zip(self.cs.shift.par_iter())
266+
.map(|(witness, shift)| {
267+
&(witness + gamma) + &self.cs.precomputations().poly_x_d1.scale(beta * shift)
268+
})
269+
.reduce_with(|mut l, r| {
270+
l *= &r;
271+
l
272+
})
273+
.unwrap()
274+
* &lagrange.d8.this.z.clone();
266275

267276
// sigmas = z(x * w) *
268277
// (w8[0] + gamma + sigma[0] * beta) *
269278
// (w8[1] + gamma + sigma[1] * beta) * ...
270279
// (w8[6] + gamma + sigma[6] * beta)
271280
// in evaluation form in d8
272-
let mut sigmas = lagrange.d8.next.z.clone();
273-
for (witness, sigma) in lagrange
281+
let sigmas = &lagrange
274282
.d8
275283
.this
276284
.w
277-
.iter()
278-
.zip(self.column_evaluations.permutation_coefficients8.iter())
279-
{
280-
let term = witness + &(gamma + &sigma.scale(beta));
281-
sigmas = &sigmas * &term;
282-
}
283-
284-
&(&shifts - &sigmas).scale(alpha0)
285-
* &self.cs.precomputations().permutation_vanishing_polynomial_l
285+
.par_iter()
286+
.zip(self.column_evaluations.permutation_coefficients8.par_iter())
287+
.map(|(witness, sigma)| witness + &(gamma + &sigma.scale(beta)))
288+
.reduce_with(|mut l, r| {
289+
l *= &r;
290+
l
291+
})
292+
.unwrap()
293+
* &lagrange.d8.next.z.clone();
294+
295+
let res = &(&shifts - &sigmas).scale(alpha0)
296+
* &self.cs.precomputations().permutation_vanishing_polynomial_l;
297+
298+
res
286299
};
287300

288301
//~ and `bnd`:
@@ -324,7 +337,6 @@ impl<F: PrimeField, G: KimchiCurve<ScalarField = F>, OpeningProof: OpenProof<G>>
324337

325338
&bnd1.scale(alpha1) + &bnd2.scale(alpha2)
326339
};
327-
328340
Ok((perm, bnd))
329341
}
330342

@@ -433,8 +445,6 @@ impl<F: PrimeField, G: KimchiCurve<ScalarField = F>, OpeningProof: OpenProof<G>>
433445
//~ The first evaluation represents the initial value of the accumulator:
434446
//~ $$z(g^0) = 1$$
435447

436-
let mut z = vec![F::one(); n];
437-
438448
//~ For $i = 0, \cdot, n - 4$, where $n$ is the size of the domain,
439449
//~ evaluations are computed as:
440450
//~
@@ -468,27 +478,61 @@ impl<F: PrimeField, G: KimchiCurve<ScalarField = F>, OpeningProof: OpenProof<G>>
468478
//~ \end{align}
469479
//~ $$
470480
//~
471-
for j in 0..n - 1 {
472-
z[j + 1] = witness
473-
.iter()
474-
.zip(self.column_evaluations.permutation_coefficients8.iter())
475-
.map(|(w, s)| w[j] + (s[8 * j] * beta) + gamma)
476-
.fold(F::one(), |x, y| x * y);
477-
}
481+
482+
// We compute z such that:
483+
// z[0] = 1
484+
// z[j+1] = \Prod_{i=0}^{PERMUTS}(wit[i][j] + (s[i][8*j] * beta) + gamma) for j ∈ 0..n-1
485+
//
486+
// We compute every product batch separately first (one batch
487+
// per i∈[COLUMNS]), and then multiply all batches together.
488+
//
489+
// Note that we zip an array of COLUMNS elements with an array of PERMUTS elements;
490+
// since PERMUTS < COLUMNS, only the first PERMUTS columns are actually used.
491+
let mut z: Vec<F> = witness
492+
.par_iter()
493+
.zip(self.column_evaluations.permutation_coefficients8.par_iter())
494+
.map(|(w_i, perm_coeffs8_i)| {
495+
let mut output_vec: Vec<_> = vec![F::one(); 1];
496+
for (j, w_i_j) in w_i.iter().enumerate().take(n - 1) {
497+
output_vec.push(*w_i_j + (perm_coeffs8_i[8 * j] * beta) + gamma);
498+
}
499+
output_vec
500+
})
501+
.reduce_with(|mut l, r| {
502+
for i in 0..n - 1 {
503+
l[i] *= &r[i];
504+
}
505+
l
506+
})
507+
.unwrap();
478508

479509
ark_ff::fields::batch_inversion::<F>(&mut z[1..n]);
480510

511+
let z_prefolded: Vec<F> = witness
512+
.par_iter()
513+
.zip(self.cs.shift.par_iter())
514+
.map(|(w_i, shift_i)| {
515+
let mut output_vec: Vec<_> = vec![F::one(); 1];
516+
for (j, w_i_j) in w_i.iter().enumerate().take(n - 1) {
517+
output_vec.push(*w_i_j + (self.cs.sid[j] * beta * shift_i) + gamma);
518+
}
519+
output_vec
520+
})
521+
.reduce_with(|mut l, r| {
522+
for i in 0..n - 1 {
523+
l[i] *= &r[i];
524+
}
525+
l
526+
})
527+
.unwrap();
528+
481529
//~ We randomize the evaluations at `n - zk_rows + 1` and `n - zk_rows + 2` in order to add
482530
//~ zero-knowledge to the protocol.
483531
//~
484532
for j in 0..n - 1 {
485533
if j != n - zk_rows && j != n - zk_rows + 1 {
486534
let x = z[j];
487-
z[j + 1] *= witness
488-
.iter()
489-
.zip(self.cs.shift.iter())
490-
.map(|(w, s)| w[j] + (self.cs.sid[j] * beta * s) + gamma)
491-
.fold(x, |z, y| z * y);
535+
z[j + 1] *= z_prefolded[j + 1] * x;
492536
} else {
493537
z[j + 1] = F::rand(rng);
494538
}
@@ -501,6 +545,7 @@ impl<F: PrimeField, G: KimchiCurve<ScalarField = F>, OpeningProof: OpenProof<G>>
501545
};
502546

503547
let res = Evaluations::<F, D<F>>::from_vec_and_domain(z, self.cs.domain.d1).interpolate();
548+
504549
Ok(res)
505550
}
506551
}

0 commit comments

Comments
 (0)