Skip to content

Commit e608742

Browse files
authored
chore: optimise polynomial initialisation (#10073)
While analysing the impact of using a large ambient trace (2^20) in the ClientIVC bench, with no changes to the circuit, one culprit turned out to be the initialisation of polynomials defined over the full domain with 0. As such, I parallelised the initialisation function inside the polynomial class, which also brings an improvement to the ClientIVC bench as it is. Default benchmark NOW: ``` -------------------------------------------------------------------------------- Benchmark Time CPU -------------------------------------------------------------------------------- ClientIVCBench/Full/6 29956 ms 28100 ms ``` BEFORE: ``` -------------------------------------------------------------------------------- Benchmark Time CPU -------------------------------------------------------------------------------- ClientIVCBench/Full/6 32341 ms 30470 ms ``` Benchmark with 2^20 ambient trace NOW: ``` -------------------------------------------------------------------------------- Benchmark Time CPU -------------------------------------------------------------------------------- ClientIVCBench/Full/6 39013 ms 36526 ms ``` BEFORE: ``` -------------------------------------------------------------------------------- Benchmark Time CPU -------------------------------------------------------------------------------- ClientIVCBench/Full/6 44346 ms 41778 ms ``` Note: this is disabled for AVM as they do parallel polynomial construction and have smaller polynomials.
1 parent b8bace9 commit e608742

File tree

5 files changed

+78
-52
lines changed

5 files changed

+78
-52
lines changed

barretenberg/cpp/src/barretenberg/polynomials/polynomial.cpp

+22-2
Original file line numberDiff line numberDiff line change
@@ -60,10 +60,29 @@ void Polynomial<Fr>::allocate_backing_memory(size_t size, size_t virtual_size, s
6060
*
6161
* @param size The size of the polynomial.
6262
*/
63-
template <typename Fr> Polynomial<Fr>::Polynomial(size_t size, size_t virtual_size, size_t start_index)
63+
template <typename Fr>
64+
Polynomial<Fr>::Polynomial(size_t size, size_t virtual_size, size_t start_index, bool disable_parallelisation)
6465
{
66+
PROFILE_THIS_NAME("polynomial allocation with zeroing");
67+
6568
allocate_backing_memory(size, virtual_size, start_index);
66-
memset(static_cast<void*>(coefficients_.backing_memory_.get()), 0, sizeof(Fr) * size);
69+
if (disable_parallelisation) {
70+
// In AVM polynomials are small and already constructed in parallel
71+
memset(static_cast<void*>(coefficients_.backing_memory_.get()), 0, sizeof(Fr) * size);
72+
return;
73+
}
74+
75+
size_t num_threads = calculate_num_threads(size);
76+
size_t range_per_thread = size / num_threads;
77+
size_t leftovers = size - (range_per_thread * num_threads);
78+
79+
parallel_for(num_threads, [&](size_t j) {
80+
size_t offset = j * range_per_thread;
81+
size_t range = (j == num_threads - 1) ? range_per_thread + leftovers : range_per_thread;
82+
ASSERT(offset < size || size == 0);
83+
ASSERT((offset + range) <= size);
84+
memset(static_cast<void*>(coefficients_.backing_memory_.get() + offset), 0, sizeof(Fr) * range);
85+
});
6786
}
6887

6988
/**
@@ -76,6 +95,7 @@ template <typename Fr> Polynomial<Fr>::Polynomial(size_t size, size_t virtual_si
7695
template <typename Fr>
7796
Polynomial<Fr>::Polynomial(size_t size, size_t virtual_size, size_t start_index, [[maybe_unused]] DontZeroMemory flag)
7897
{
98+
PROFILE_THIS_NAME("polynomial allocation without zeroing");
7999
allocate_backing_memory(size, virtual_size, start_index);
80100
}
81101

barretenberg/cpp/src/barretenberg/polynomials/polynomial.hpp

+3-5
Original file line numberDiff line numberDiff line change
@@ -65,13 +65,11 @@ template <typename Fr> class Polynomial {
6565
using FF = Fr;
6666
enum class DontZeroMemory { FLAG };
6767

68-
Polynomial(size_t size, size_t virtual_size, size_t start_index = 0);
68+
Polynomial(size_t size, size_t virtual_size, size_t start_index = 0, bool disable_parallelisation = false);
6969
// Intended just for plonk, where size == virtual_size always
7070
Polynomial(size_t size)
71-
: Polynomial(size, size)
72-
{
73-
PROFILE_THIS();
74-
}
71+
: Polynomial(size, size){};
72+
7573
// Constructor that does not initialize values, use with caution to save time.
7674
Polynomial(size_t size, size_t virtual_size, size_t start_index, DontZeroMemory flag);
7775
Polynomial(size_t size, size_t virtual_size, DontZeroMemory flag)

barretenberg/cpp/src/barretenberg/ultra_honk/decider_proving_key.hpp

+6-6
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ template <IsUltraFlavor Flavor> class DeciderProvingKey_ {
9797
}
9898
{
9999

100-
PROFILE_THIS_NAME("constructing proving key");
100+
PROFILE_THIS_NAME("allocating proving key");
101101

102102
proving_key = ProvingKey(dyadic_circuit_size, circuit.public_inputs.size(), commitment_key);
103103
// If not using structured trace OR if using structured trace but overflow has occurred (overflow block in
@@ -189,27 +189,27 @@ template <IsUltraFlavor Flavor> class DeciderProvingKey_ {
189189
// Allocate the table polynomials
190190
if constexpr (IsUltraFlavor<Flavor>) {
191191
for (auto& poly : proving_key.polynomials.get_tables()) {
192-
poly = typename Flavor::Polynomial(max_tables_size, dyadic_circuit_size, table_offset);
192+
poly = Polynomial(max_tables_size, dyadic_circuit_size, table_offset);
193193
}
194194
}
195195
}
196196
{
197197
PROFILE_THIS_NAME("allocating sigmas and ids");
198198

199199
for (auto& sigma : proving_key.polynomials.get_sigmas()) {
200-
sigma = typename Flavor::Polynomial(proving_key.circuit_size);
200+
sigma = Polynomial(proving_key.circuit_size);
201201
}
202202
for (auto& id : proving_key.polynomials.get_ids()) {
203-
id = typename Flavor::Polynomial(proving_key.circuit_size);
203+
id = Polynomial(proving_key.circuit_size);
204204
}
205205
}
206206
{
207207
ZoneScopedN("allocating lookup read counts and tags");
208208
// Allocate the read counts and tags polynomials
209209
proving_key.polynomials.lookup_read_counts =
210-
typename Flavor::Polynomial(max_tables_size, dyadic_circuit_size, table_offset);
210+
Polynomial(max_tables_size, dyadic_circuit_size, table_offset);
211211
proving_key.polynomials.lookup_read_tags =
212-
typename Flavor::Polynomial(max_tables_size, dyadic_circuit_size, table_offset);
212+
Polynomial(max_tables_size, dyadic_circuit_size, table_offset);
213213
}
214214
{
215215
ZoneScopedN("allocating lookup and databus inverses");

barretenberg/cpp/src/barretenberg/vm/avm/generated/circuit_builder.cpp

+42-37
Original file line numberDiff line numberDiff line change
@@ -51,49 +51,54 @@ AvmCircuitBuilder::ProverPolynomials AvmCircuitBuilder::compute_polynomials() co
5151
}));
5252

5353
// catch-all with fully formed polynomials
54-
AVM_TRACK_TIME(
55-
"circuit_builder/init_polys_unshifted", ({
56-
auto unshifted = polys.get_unshifted();
54+
AVM_TRACK_TIME("circuit_builder/init_polys_unshifted", ({
55+
auto unshifted = polys.get_unshifted();
5756

58-
// An array which stores for each column of the trace the smallest size of the
59-
// truncated column containing all non-zero elements.
60-
// It is used to allocate the polynomials without memory overhead for the tail of zeros.
61-
std::array<size_t, Row::SIZE> col_nonzero_size{};
57+
// An array which stores for each column of the trace the smallest size of the
58+
// truncated column containing all non-zero elements.
59+
// It is used to allocate the polynomials without memory overhead for the tail of zeros.
60+
std::array<size_t, Row::SIZE> col_nonzero_size{};
6261

63-
// Computation of size of columns.
64-
// Non-parallel version takes 0.5 second for a trace size of 200k rows.
65-
// A parallel version might be considered in the future.
66-
for (size_t i = 0; i < num_rows; i++) {
67-
const auto row = rows[i].as_vector();
68-
for (size_t col = 0; col < Row::SIZE; col++) {
69-
if (!row[col].is_zero()) {
70-
col_nonzero_size[col] = i + 1;
71-
}
72-
}
73-
}
62+
// Computation of size of columns.
63+
// Non-parallel version takes 0.5 second for a trace size of 200k rows.
64+
// A parallel version might be considered in the future.
65+
for (size_t i = 0; i < num_rows; i++) {
66+
const auto row = rows[i].as_vector();
67+
for (size_t col = 0; col < Row::SIZE; col++) {
68+
if (!row[col].is_zero()) {
69+
col_nonzero_size[col] = i + 1;
70+
}
71+
}
72+
}
7473

75-
// Set of the labels for derived/inverse polynomials.
76-
const auto derived_labels = polys.get_derived_labels();
77-
std::set<std::string> derived_labels_set(derived_labels.begin(), derived_labels.end());
74+
// Set of the labels for derived/inverse polynomials.
75+
const auto derived_labels = polys.get_derived_labels();
76+
std::set<std::string> derived_labels_set(derived_labels.begin(), derived_labels.end());
7877

79-
bb::parallel_for(num_unshifted, [&](size_t i) {
80-
auto& poly = unshifted[i];
81-
const auto col_idx = polys_to_cols_unshifted_idx[i];
82-
size_t col_size = 0;
78+
bb::parallel_for(num_unshifted, [&](size_t i) {
79+
auto& poly = unshifted[i];
80+
const auto col_idx = polys_to_cols_unshifted_idx[i];
81+
size_t col_size = 0;
8382

84-
// We fully allocate the inverse polynomials. We leave this potential memory optimization for later.
85-
if (derived_labels_set.contains(labels[i])) {
86-
col_size = num_rows;
87-
} else {
88-
col_size = col_nonzero_size[col_idx];
89-
}
83+
// We fully allocate the inverse polynomials. We leave this potential memory optimization for
84+
// later.
85+
if (derived_labels_set.contains(labels[i])) {
86+
col_size = num_rows;
87+
} else {
88+
col_size = col_nonzero_size[col_idx];
89+
}
9090

91-
if (poly.is_empty()) {
92-
// Not set above
93-
poly = Polynomial{ /*memory size*/ col_size, /*largest possible index*/ circuit_subgroup_size };
94-
}
95-
});
96-
}));
91+
if (poly.is_empty()) {
92+
// Not set above
93+
poly = Polynomial{ /*memory size*/
94+
col_size,
95+
/*largest possible index as virtual size*/ circuit_subgroup_size,
96+
/*start_index=*/0,
97+
/*disable parallel initialisation=*/true
98+
};
99+
}
100+
});
101+
}));
97102

98103
AVM_TRACK_TIME(
99104
"circuit_builder/set_polys_unshifted", ({

bb-pilcom/bb-pil-backend/templates/circuit_builder.cpp.hbs

+5-2
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,11 @@ namespace bb {
8989

9090
if (poly.is_empty()) {
9191
// Not set above
92-
poly = Polynomial{ /*memory size*/ col_size, /*largest possible index*/ circuit_subgroup_size };
93-
}
92+
poly = Polynomial{ /*memory size*/ col_size,
93+
/*largest possible index as virtual size*/ circuit_subgroup_size,
94+
/*start_index=*/0,
95+
/*disable parallel initialization=*/true
96+
}; }
9497
});
9598
}));
9699

0 commit comments

Comments
 (0)