Skip to content

Commit 3222167

Browse files
authored
Merge pull request #2969 from o1-labs/volhovm/profiling-kimchi-thread-utilisation
Performance: parallelising kimchi prover & verifier
2 parents 59ad3a7 + 0cb4118 commit 3222167

File tree

14 files changed

+594
-132
lines changed

14 files changed

+594
-132
lines changed

kimchi/benches/proof_criterion.rs

+28-9
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
1+
#![allow(clippy::unit_arg)]
12
use criterion::{black_box, criterion_group, criterion_main, Criterion, SamplingMode};
23
use kimchi::bench::BenchmarkCtx;
34

45
pub fn bench_proof_creation(c: &mut Criterion) {
56
let mut group = c.benchmark_group("Proof creation");
6-
group.sample_size(10).sampling_mode(SamplingMode::Flat); // for slow benchmarks
7+
group.sampling_mode(SamplingMode::Flat); // for slow benchmarks
8+
group.measurement_time(std::time::Duration::from_secs(90));
79

8-
for size in [10, 14] {
10+
for size in [10, 15, 16] {
911
let ctx = BenchmarkCtx::new(size);
1012

1113
group.bench_function(
@@ -21,19 +23,36 @@ pub fn bench_proof_creation(c: &mut Criterion) {
2123

2224
pub fn bench_proof_verification(c: &mut Criterion) {
2325
let mut group = c.benchmark_group("Proof verification");
24-
group.sample_size(100).sampling_mode(SamplingMode::Auto);
2526

26-
for size in [10, 14] {
27-
let ctx = BenchmarkCtx::new(size);
28-
let proof_and_public = ctx.create_proof();
27+
// Unfortunately, we have to use relatively big sample sizes. With this
28+
// the noise should be <0.5%
29+
group.sampling_mode(SamplingMode::Linear);
30+
group.measurement_time(std::time::Duration::from_secs(300));
31+
32+
for n_gates_log in [10, 15, 16] {
33+
// averaging over several proofs and contexts, since using
34+
// just one seems to introduce extra variance.
35+
let inputs: Vec<_> = (0..20)
36+
.map(|_| {
37+
let ctx = BenchmarkCtx::new(n_gates_log);
38+
let proof = ctx.create_proof();
39+
(ctx, proof)
40+
})
41+
.collect();
2942

3043
group.bench_function(
3144
format!(
3245
"proof verification (SRS size 2^{{{}}}, {} gates)",
33-
ctx.srs_size(),
34-
ctx.num_gates
46+
inputs[0].0.srs_size(),
47+
1 << n_gates_log
3548
),
36-
|b| b.iter(|| ctx.batch_verification(black_box(&vec![proof_and_public.clone()]))),
49+
|b| {
50+
b.iter_batched(
51+
|| &inputs[rand::random::<usize>() % inputs.len()],
52+
|(ctx, proof)| black_box(ctx.batch_verification(std::slice::from_ref(proof))),
53+
criterion::BatchSize::LargeInput,
54+
)
55+
},
3756
);
3857
}
3958
}

kimchi/src/bench.rs

+7-8
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ use mina_poseidon::{
77
sponge::{DefaultFqSponge, DefaultFrSponge},
88
};
99
use o1_utils::math;
10-
use poly_commitment::{commitment::CommitmentCurve, ipa::OpeningProof, SRS as _};
10+
use poly_commitment::{commitment::CommitmentCurve, ipa::OpeningProof, SRS};
1111

1212
use crate::{
1313
circuits::{
@@ -18,7 +18,6 @@ use crate::{
1818
proof::ProverProof,
1919
prover_index::{testing::new_index_for_test, ProverIndex},
2020
verifier::{batch_verify, Context},
21-
verifier_index::VerifierIndex,
2221
};
2322

2423
type SpongeParams = PlonkSpongeConstantsKimchi;
@@ -29,7 +28,6 @@ pub struct BenchmarkCtx {
2928
pub num_gates: usize,
3029
group_map: BWParameters<VestaParameters>,
3130
index: ProverIndex<Vesta, OpeningProof<Vesta>>,
32-
verifier_index: VerifierIndex<Vesta, OpeningProof<Vesta>>,
3331
}
3432

3533
impl BenchmarkCtx {
@@ -61,19 +59,20 @@ impl BenchmarkCtx {
6159
let group_map = <Vesta as CommitmentCurve>::Map::setup();
6260

6361
// create the index
64-
let index = new_index_for_test(gates, 0);
62+
let mut index = new_index_for_test(gates, 0);
6563

6664
assert_eq!(index.cs.domain.d1.log_size_of_group, srs_size_log2, "the test wanted to use an SRS of size {srs_size_log2} but the domain size ended up being {}", index.cs.domain.d1.log_size_of_group);
6765

6866
// create the verifier index
69-
let verifier_index = index.verifier_index();
67+
index.compute_verifier_index_digest::<BaseSponge>();
68+
69+
// just in case check that lagrange bases are generated
70+
index.srs.get_lagrange_basis(index.cs.domain.d1);
7071

71-
//
7272
BenchmarkCtx {
7373
num_gates,
7474
group_map,
7575
index,
76-
verifier_index,
7776
}
7877
}
7978

@@ -104,7 +103,7 @@ impl BenchmarkCtx {
104103
let batch: Vec<_> = batch
105104
.iter()
106105
.map(|(proof, public)| Context {
107-
verifier_index: &self.verifier_index,
106+
verifier_index: self.index.verifier_index.as_ref().unwrap(),
108107
proof,
109108
public_input: public,
110109
})

kimchi/src/circuits/constraints.rs

+45-16
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ use ark_poly::{
2626
use o1_utils::ExtendedEvaluations;
2727
use once_cell::sync::OnceCell;
2828
use poly_commitment::OpenProof;
29+
use rayon::prelude::*;
2930
use serde::{de::DeserializeOwned, Deserialize, Serialize};
3031
use serde_with::serde_as;
3132
use std::{array, default::Default, sync::Arc};
@@ -373,25 +374,53 @@ impl<F: PrimeField, G: KimchiCurve<ScalarField = F>, OpeningProof: OpenProof<G>>
373374
impl<F: PrimeField> ConstraintSystem<F> {
374375
/// evaluate witness polynomials over domains
375376
pub fn evaluate(&self, w: &[DP<F>; COLUMNS], z: &DP<F>) -> WitnessOverDomains<F> {
376-
// compute shifted witness polynomials
377-
let w8: [E<F, D<F>>; COLUMNS] =
378-
array::from_fn(|i| w[i].evaluate_over_domain_by_ref(self.domain.d8));
379-
let z8 = z.evaluate_over_domain_by_ref(self.domain.d8);
380-
381-
let w4: [E<F, D<F>>; COLUMNS] = array::from_fn(|i| {
382-
E::<F, D<F>>::from_vec_and_domain(
383-
(0..self.domain.d4.size)
384-
.map(|j| w8[i].evals[2 * j as usize])
385-
.collect(),
386-
self.domain.d4,
387-
)
388-
});
377+
// compute shifted witness polynomials and z8, all in parallel
378+
let (w8, z8): ([E<F, D<F>>; COLUMNS], _) = {
379+
let mut res = w
380+
.par_iter()
381+
.chain(rayon::iter::once(z))
382+
.map(|elem| elem.evaluate_over_domain_by_ref(self.domain.d8))
383+
.collect::<Vec<_>>();
384+
let z8 = res[COLUMNS].clone();
385+
res.truncate(COLUMNS);
386+
(res.try_into().unwrap(), z8)
387+
};
388+
389+
let w4: [E<F, D<F>>; COLUMNS] = (0..COLUMNS)
390+
.into_par_iter()
391+
.map(|i| {
392+
E::<F, D<F>>::from_vec_and_domain(
393+
(0..self.domain.d4.size)
394+
.map(|j| w8[i].evals[2 * j as usize])
395+
.collect(),
396+
self.domain.d4,
397+
)
398+
})
399+
.collect::<Vec<_>>()
400+
.try_into()
401+
.unwrap();
402+
389403
let z4 = DP::<F>::zero().evaluate_over_domain_by_ref(D::<F>::new(1).unwrap());
404+
let z8_shift8 = z8.shift(8);
405+
406+
let d4_next_w: [_; COLUMNS] = w4
407+
.par_iter()
408+
.map(|w4_i| w4_i.shift(4))
409+
.collect::<Vec<_>>()
410+
.try_into()
411+
.unwrap();
412+
413+
let d8_next_w: [_; COLUMNS] = w8
414+
.par_iter()
415+
.map(|w8_i| w8_i.shift(8))
416+
.collect::<Vec<_>>()
417+
.try_into()
418+
.unwrap();
390419

391420
WitnessOverDomains {
392421
d4: WitnessShifts {
393422
next: WitnessEvals {
394-
w: array::from_fn(|i| w4[i].shift(4)),
423+
w: d4_next_w,
395424
// TODO(mimoo): change z to an Option? Or maybe not, we might actually need this dummy evaluation in the aggregated evaluation proof
396425
z: z4.clone(), // dummy evaluation
397426
},
@@ -402,8 +431,8 @@ impl<F: PrimeField> ConstraintSystem<F> {
402431
},
403432
d8: WitnessShifts {
404433
next: WitnessEvals {
405-
w: array::from_fn(|i| w8[i].shift(8)),
406-
z: z8.shift(8),
434+
w: d8_next_w,
435+
z: z8_shift8,
407436
},
408437
this: WitnessEvals { w: w8, z: z8 },
409438
},

kimchi/src/circuits/polynomials/permutation.rs

+77-32
Original file line numberDiff line numberDiff line change
@@ -257,32 +257,45 @@ impl<F: PrimeField, G: KimchiCurve<ScalarField = F>, OpeningProof: OpenProof<G>>
257257
// (w[1](x) + gamma + x * beta * shift[1]) * ...
258258
// (w[6](x) + gamma + x * beta * shift[6])
259259
// in evaluation form in d8
260-
let mut shifts = lagrange.d8.this.z.clone();
261-
for (witness, shift) in lagrange.d8.this.w.iter().zip(self.cs.shift.iter()) {
262-
let term =
263-
&(witness + gamma) + &self.cs.precomputations().poly_x_d1.scale(beta * shift);
264-
shifts = &shifts * &term;
265-
}
260+
let shifts: Evaluations<F, D<F>> = &lagrange
261+
.d8
262+
.this
263+
.w
264+
.par_iter()
265+
.zip(self.cs.shift.par_iter())
266+
.map(|(witness, shift)| {
267+
&(witness + gamma) + &self.cs.precomputations().poly_x_d1.scale(beta * shift)
268+
})
269+
.reduce_with(|mut l, r| {
270+
l *= &r;
271+
l
272+
})
273+
.unwrap()
274+
* &lagrange.d8.this.z.clone();
266275

267276
// sigmas = z(x * w) *
268277
// (w8[0] + gamma + sigma[0] * beta) *
269278
// (w8[1] + gamma + sigma[1] * beta) * ...
270279
// (w8[6] + gamma + sigma[6] * beta)
271280
// in evaluation form in d8
272-
let mut sigmas = lagrange.d8.next.z.clone();
273-
for (witness, sigma) in lagrange
281+
let sigmas = &lagrange
274282
.d8
275283
.this
276284
.w
277-
.iter()
278-
.zip(self.column_evaluations.permutation_coefficients8.iter())
279-
{
280-
let term = witness + &(gamma + &sigma.scale(beta));
281-
sigmas = &sigmas * &term;
282-
}
283-
284-
&(&shifts - &sigmas).scale(alpha0)
285-
* &self.cs.precomputations().permutation_vanishing_polynomial_l
285+
.par_iter()
286+
.zip(self.column_evaluations.permutation_coefficients8.par_iter())
287+
.map(|(witness, sigma)| witness + &(gamma + &sigma.scale(beta)))
288+
.reduce_with(|mut l, r| {
289+
l *= &r;
290+
l
291+
})
292+
.unwrap()
293+
* &lagrange.d8.next.z.clone();
294+
295+
let res = &(&shifts - &sigmas).scale(alpha0)
296+
* &self.cs.precomputations().permutation_vanishing_polynomial_l;
297+
298+
res
286299
};
287300

288301
//~ and `bnd`:
@@ -324,7 +337,6 @@ impl<F: PrimeField, G: KimchiCurve<ScalarField = F>, OpeningProof: OpenProof<G>>
324337

325338
&bnd1.scale(alpha1) + &bnd2.scale(alpha2)
326339
};
327-
328340
Ok((perm, bnd))
329341
}
330342

@@ -433,8 +445,6 @@ impl<F: PrimeField, G: KimchiCurve<ScalarField = F>, OpeningProof: OpenProof<G>>
433445
//~ The first evaluation represents the initial value of the accumulator:
434446
//~ $$z(g^0) = 1$$
435447

436-
let mut z = vec![F::one(); n];
437-
438448
//~ For $i = 0, \cdot, n - 4$, where $n$ is the size of the domain,
439449
//~ evaluations are computed as:
440450
//~
@@ -468,27 +478,61 @@ impl<F: PrimeField, G: KimchiCurve<ScalarField = F>, OpeningProof: OpenProof<G>>
468478
//~ \end{align}
469479
//~ $$
470480
//~
471-
for j in 0..n - 1 {
472-
z[j + 1] = witness
473-
.iter()
474-
.zip(self.column_evaluations.permutation_coefficients8.iter())
475-
.map(|(w, s)| w[j] + (s[8 * j] * beta) + gamma)
476-
.fold(F::one(), |x, y| x * y);
477-
}
481+
482+
// We compute z such that:
483+
// z[0] = 1
484+
// z[j+1] = \Prod_{i=0}^{PERMUTS}(wit[i][j] + (s[i][8*j] * beta) + gamma) for j ∈ 0..n-1
485+
//
486+
// We compute every product batch separately first (one batch
487+
// per i∈[COLUMNS]), and then multiply all batches together.
488+
//
489+
// Note that we zip an array of COLUMNS elements with an array of PERMUTS elements;
490+
// since PERMUTS < COLUMNS, only the first PERMUTS columns are actually used.
491+
let mut z: Vec<F> = witness
492+
.par_iter()
493+
.zip(self.column_evaluations.permutation_coefficients8.par_iter())
494+
.map(|(w_i, perm_coeffs8_i)| {
495+
let mut output_vec: Vec<_> = vec![F::one(); 1];
496+
for (j, w_i_j) in w_i.iter().enumerate().take(n - 1) {
497+
output_vec.push(*w_i_j + (perm_coeffs8_i[8 * j] * beta) + gamma);
498+
}
499+
output_vec
500+
})
501+
.reduce_with(|mut l, r| {
502+
for i in 0..n - 1 {
503+
l[i] *= &r[i];
504+
}
505+
l
506+
})
507+
.unwrap();
478508

479509
ark_ff::fields::batch_inversion::<F>(&mut z[1..n]);
480510

511+
let z_prefolded: Vec<F> = witness
512+
.par_iter()
513+
.zip(self.cs.shift.par_iter())
514+
.map(|(w_i, shift_i)| {
515+
let mut output_vec: Vec<_> = vec![F::one(); 1];
516+
for (j, w_i_j) in w_i.iter().enumerate().take(n - 1) {
517+
output_vec.push(*w_i_j + (self.cs.sid[j] * beta * shift_i) + gamma);
518+
}
519+
output_vec
520+
})
521+
.reduce_with(|mut l, r| {
522+
for i in 0..n - 1 {
523+
l[i] *= &r[i];
524+
}
525+
l
526+
})
527+
.unwrap();
528+
481529
//~ We randomize the evaluations at `n - zk_rows + 1` and `n - zk_rows + 2` in order to add
482530
//~ zero-knowledge to the protocol.
483531
//~
484532
for j in 0..n - 1 {
485533
if j != n - zk_rows && j != n - zk_rows + 1 {
486534
let x = z[j];
487-
z[j + 1] *= witness
488-
.iter()
489-
.zip(self.cs.shift.iter())
490-
.map(|(w, s)| w[j] + (self.cs.sid[j] * beta * s) + gamma)
491-
.fold(x, |z, y| z * y);
535+
z[j + 1] *= z_prefolded[j + 1] * x;
492536
} else {
493537
z[j + 1] = F::rand(rng);
494538
}
@@ -501,6 +545,7 @@ impl<F: PrimeField, G: KimchiCurve<ScalarField = F>, OpeningProof: OpenProof<G>>
501545
};
502546

503547
let res = Evaluations::<F, D<F>>::from_vec_and_domain(z, self.cs.domain.d1).interpolate();
548+
504549
Ok(res)
505550
}
506551
}

0 commit comments

Comments
 (0)