MB-63643: Fix missing num_threads clauses (#44)

CascadingRadium · abhinavdangeti · web-flow · commit 8eecdb636e99 · 2024-12-04T14:50:57.000-07:00
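- In some places the `#pragma omp` directives were missing the `num_threads`
clause, so the global OMP config was ignored at those call sites. Existing
clauses that used a local thread count (`nt`, `nseg1`, `nt_in`) are also
switched to `num_omp_threads`.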

---------

Co-authored-by: Abhinav Dangeti <abhinav@couchbase.com>
diff --git a/faiss/IndexFastScan.cpp b/faiss/IndexFastScan.cpp
@@ -323,7 +323,7 @@ void IndexFastScan::search_dispatch_implem(
             }
         } else {
             // explicitly slice over threads
-#pragma omp parallel for num_threads(nt)
+#pragma omp parallel for num_threads(num_omp_threads)
             for (int slice = 0; slice < nt; slice++) {
                 idx_t i0 = n * slice / nt;
                 idx_t i1 = n * (slice + 1) / nt;
diff --git a/faiss/IndexIVF.cpp b/faiss/IndexIVF.cpp
@@ -1006,7 +1006,7 @@ void IndexIVF::search_and_reconstruct(
             labels,
             true /* store_pairs */,
             params);
-#pragma omp parallel for if (n * k > 1000)
+#pragma omp parallel for if (n * k > 1000) num_threads(num_omp_threads)
     for (idx_t ij = 0; ij < n * k; ij++) {
         idx_t key = labels[ij];
         float* reconstructed = recons + ij * d;
@@ -1068,7 +1068,7 @@ void IndexIVF::search_and_return_codes(
         code_size_1 += coarse_code_size();
     }
 
-#pragma omp parallel for if (n * k > 1000)
+#pragma omp parallel for if (n * k > 1000) num_threads(num_omp_threads)
     for (idx_t ij = 0; ij < n * k; ij++) {
         idx_t key = labels[ij];
         uint8_t* code1 = codes + ij * code_size_1;
diff --git a/faiss/IndexIVFFastScan.cpp b/faiss/IndexIVFFastScan.cpp
@@ -640,7 +640,7 @@ void IndexIVFFastScan::range_search_dispatch_implem(
     } else {
         // explicitly slice over threads
         int nslice = compute_search_nslice(this, n, cq.nprobe);
-#pragma omp parallel
+#pragma omp parallel num_threads(num_omp_threads)
         {
             RangeSearchPartialResult pres(&rres);
 
diff --git a/faiss/impl/PolysemousTraining.cpp b/faiss/impl/PolysemousTraining.cpp
@@ -779,7 +779,7 @@ void PolysemousTraining::optimize_reproduce_distances(
                 nt);
     }
 
-#pragma omp parallel for num_threads(nt)
+#pragma omp parallel for num_threads(num_omp_threads)
     for (int m = 0; m < pq.M; m++) {
         std::vector<double> dis_table;
 
diff --git a/faiss/impl/ProductQuantizer.cpp b/faiss/impl/ProductQuantizer.cpp
@@ -313,7 +313,7 @@ void ProductQuantizer::decode(const uint8_t* code, float* x) const {
 }
 
 void ProductQuantizer::decode(const uint8_t* code, float* x, size_t n) const {
-#pragma omp parallel for if (n > 100)
+#pragma omp parallel for if (n > 100) num_threads(num_omp_threads)
     for (int64_t i = 0; i < n; i++) {
         this->decode(code + code_size * i, x + d * i);
     }
diff --git a/faiss/impl/residual_quantizer_encode_steps.cpp b/faiss/impl/residual_quantizer_encode_steps.cpp
@@ -275,7 +275,7 @@ void beam_search_encode_step(
     }
     InterruptCallback::check();
 
-#pragma omp parallel for if (n > 100)
+#pragma omp parallel for if (n > 100) num_threads(num_omp_threads)
     for (int64_t i = 0; i < n; i++) {
         const int32_t* codes_i = codes + i * m * beam_size;
         int32_t* new_codes_i = new_codes + i * (m + 1) * new_beam_size;
@@ -399,7 +399,7 @@ void beam_search_encode_step_tab(
 {
     FAISS_THROW_IF_NOT(ldc >= K);
 
-#pragma omp parallel for if (n > 100) schedule(dynamic)
+#pragma omp parallel for if (n > 100) schedule(dynamic) num_threads(num_omp_threads)
     for (int64_t i = 0; i < n; i++) {
         std::vector<float> cent_distances(beam_size * K);
         std::vector<float> cd_common(K);
diff --git a/faiss/utils/distances.cpp b/faiss/utils/distances.cpp
@@ -146,7 +146,7 @@ void exhaustive_inner_product_seq(
 
     FAISS_ASSERT(use_sel == (sel != nullptr));
 
-#pragma omp parallel num_threads(nt)
+#pragma omp parallel num_threads(num_omp_threads)
     {
         SingleResultHandler resi(res);
 #pragma omp for
@@ -183,7 +183,7 @@ void exhaustive_L2sqr_seq(
 
     FAISS_ASSERT(use_sel == (sel != nullptr));
 
-#pragma omp parallel num_threads(nt)
+#pragma omp parallel num_threads(num_omp_threads)
     {
         SingleResultHandler resi(res);
 #pragma omp for
diff --git a/faiss/utils/hamming.cpp b/faiss/utils/hamming.cpp
@@ -293,7 +293,7 @@ void hamming_range_search(
         int radius,
         size_t code_size,
         RangeSearchResult* res) {
-#pragma omp parallel
+#pragma omp parallel num_threads(num_omp_threads)
     {
         RangeSearchPartialResult pres(res);
 
@@ -687,7 +687,7 @@ void pack_bitstrings(
         uint8_t* packed,
         size_t code_size) {
     FAISS_THROW_IF_NOT(code_size >= (M * nbit + 7) / 8);
-#pragma omp parallel for if (n > 1000)
+#pragma omp parallel for if (n > 1000) num_threads(num_omp_threads)
     for (int64_t i = 0; i < n; i++) {
         const int32_t* in = unpacked + i * M;
         uint8_t* out = packed + i * code_size;
@@ -710,7 +710,7 @@ void pack_bitstrings(
         totbit += nbit[j];
     }
     FAISS_THROW_IF_NOT(code_size >= (totbit + 7) / 8);
-#pragma omp parallel for if (n > 1000)
+#pragma omp parallel for if (n > 1000) num_threads(num_omp_threads)
     for (int64_t i = 0; i < n; i++) {
         const int32_t* in = unpacked + i * M;
         uint8_t* out = packed + i * code_size;
@@ -729,7 +729,7 @@ void unpack_bitstrings(
         size_t code_size,
         int32_t* unpacked) {
     FAISS_THROW_IF_NOT(code_size >= (M * nbit + 7) / 8);
-#pragma omp parallel for if (n > 1000)
+#pragma omp parallel for if (n > 1000) num_threads(num_omp_threads)
     for (int64_t i = 0; i < n; i++) {
         const uint8_t* in = packed + i * code_size;
         int32_t* out = unpacked + i * M;
@@ -752,7 +752,7 @@ void unpack_bitstrings(
         totbit += nbit[j];
     }
     FAISS_THROW_IF_NOT(code_size >= (totbit + 7) / 8);
-#pragma omp parallel for if (n > 1000)
+#pragma omp parallel for if (n > 1000) num_threads(num_omp_threads)
     for (int64_t i = 0; i < n; i++) {
         const uint8_t* in = packed + i * code_size;
         int32_t* out = unpacked + i * M;
diff --git a/faiss/utils/sorting.cpp b/faiss/utils/sorting.cpp
@@ -61,7 +61,7 @@ void parallel_merge(
     s2s[nt - 1].i1 = s2.i1;
 
     // not sure parallel actually helps here
-#pragma omp parallel for num_threads(nt)
+#pragma omp parallel for num_threads(num_omp_threads)
     for (int t = 0; t < nt; t++) {
         s1s[t].i0 = s1.i0 + s1.len() * t / nt;
         s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt;
@@ -93,7 +93,7 @@ void parallel_merge(
     assert(sws[nt - 1].i1 == s1.i1);
 
     // do the actual merging
-#pragma omp parallel for num_threads(nt)
+#pragma omp parallel for num_threads(num_omp_threads)
     for (int t = 0; t < nt; t++) {
         SegmentS sw = sws[t];
         SegmentS s1t = s1s[t];
@@ -176,7 +176,7 @@ void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm) {
         int sub_nt = nseg % 2 == 0 ? nt : nt - 1;
         int sub_nseg1 = nseg / 2;
 
-#pragma omp parallel for num_threads(nseg1)
+#pragma omp parallel for num_threads(num_omp_threads)
         for (int s = 0; s < nseg; s += 2) {
             if (s + 1 == nseg) { // otherwise isolated segment
                 memcpy(permB + segs[s].i0,
@@ -257,7 +257,7 @@ void bucket_sort_parallel(
         int64_t* perm,
         int nt_in) {
     memset(lims, 0, sizeof(*lims) * (vmax + 1));
-#pragma omp parallel num_threads(nt_in)
+#pragma omp parallel num_threads(num_omp_threads)
     {
         int nt = omp_get_num_threads(); // might be different from nt_in
         int rank = omp_get_thread_num();
@@ -483,7 +483,7 @@ void bucket_sort_inplace_parallel(
             nbucket); // DON'T use std::vector<bool> that cannot be accessed
                       // safely from multiple threads!!!
 
-#pragma omp parallel num_threads(nt_in)
+#pragma omp parallel num_threads(num_omp_threads)
     {
         int nt = omp_get_num_threads(); // might be different from nt_in (?)
         int rank = omp_get_thread_num();
@@ -709,7 +709,7 @@ inline int64_t hash_function(int64_t x) {
 
 void hashtable_int64_to_int64_init(int log2_capacity, int64_t* tab) {
     size_t capacity = (size_t)1 << log2_capacity;
-#pragma omp parallel for
+#pragma omp parallel for num_threads(num_omp_threads)
     for (int64_t i = 0; i < capacity; i++) {
         tab[2 * i] = -1;
         tab[2 * i + 1] = -1;
@@ -729,7 +729,7 @@ void hashtable_int64_to_int64_add(
     int log2_nbucket = log2_capacity_to_log2_nbucket(log2_capacity);
     size_t nbucket = (size_t)1 << log2_nbucket;
 
-#pragma omp parallel for
+#pragma omp parallel for num_threads(num_omp_threads)
     for (int64_t i = 0; i < n; i++) {
         hk[i] = hash_function(keys[i]) & mask;
         bucket_no[i] = hk[i] >> (log2_capacity - log2_nbucket);
@@ -746,7 +746,7 @@ void hashtable_int64_to_int64_add(
             omp_get_max_threads());
 
     int num_errors = 0;
-#pragma omp parallel for reduction(+ : num_errors)
+#pragma omp parallel for reduction(+ : num_errors) num_threads(num_omp_threads)
     for (int64_t bucket = 0; bucket < nbucket; bucket++) {
         size_t k0 = bucket << (log2_capacity - log2_nbucket);
         size_t k1 = (bucket + 1) << (log2_capacity - log2_nbucket);
@@ -793,7 +793,7 @@ void hashtable_int64_to_int64_lookup(
     int64_t mask = capacity - 1;
     int log2_nbucket = log2_capacity_to_log2_nbucket(log2_capacity);
 
-#pragma omp parallel for
+#pragma omp parallel for num_threads(num_omp_threads)
     for (int64_t i = 0; i < n; i++) {
         int64_t k = keys[i];
         int64_t hk = hash_function(k) & mask;
diff --git a/faiss/utils/utils.cpp b/faiss/utils/utils.cpp
@@ -455,7 +455,7 @@ void bvecs_checksum(size_t n, size_t d, const uint8_t* a, uint64_t* cs) {
     // so below codes only accept n <= std::numeric_limits<ssize_t>::max()
     using ssize_t = std::make_signed<std::size_t>::type;
     const ssize_t size = n;
-#pragma omp parallel for if (size > 1000)
+#pragma omp parallel for if (size > 1000) num_threads(num_omp_threads)
     for (ssize_t i_ = 0; i_ < size; i_++) {
         const auto i = static_cast<std::size_t>(i_);
         cs[i] = bvec_checksum(d, a + i * d);

Original file line number	Diff line number	Diff line change
`@@ -323,7 +323,7 @@ void IndexFastScan::search_dispatch_implem(`
`323`	`323`	`}`
`324`	`324`	`} else {`
`325`	`325`	`// explicitly slice over threads`
`326`		`-#pragma omp parallel for num_threads(nt)`
	`326`	`+#pragma omp parallel for num_threads(num_omp_threads)`
`327`	`327`	`for (int slice = 0; slice < nt; slice++) {`
`328`	`328`	`idx_t i0 = n * slice / nt;`
`329`	`329`	`idx_t i1 = n * (slice + 1) / nt;`
Original file line number	Diff line number	Diff line change
`@@ -640,7 +640,7 @@ void IndexIVFFastScan::range_search_dispatch_implem(`
`640`	`640`	`} else {`
`641`	`641`	`// explicitly slice over threads`
`642`	`642`	`int nslice = compute_search_nslice(this, n, cq.nprobe);`
`643`		`-#pragma omp parallel`
	`643`	`+#pragma omp parallel num_threads(num_omp_threads)`
`644`	`644`	`{`
`645`	`645`	`RangeSearchPartialResult pres(&rres);`
`646`	`646`
Original file line number	Diff line number	Diff line change
`@@ -779,7 +779,7 @@ void PolysemousTraining::optimize_reproduce_distances(`
`779`	`779`	`nt);`
`780`	`780`	`}`
`781`	`781`
`782`		`-#pragma omp parallel for num_threads(nt)`
	`782`	`+#pragma omp parallel for num_threads(num_omp_threads)`
`783`	`783`	`for (int m = 0; m < pq.M; m++) {`
`784`	`784`	`std::vector<double> dis_table;`
`785`	`785`
Original file line number	Diff line number	Diff line change
`@@ -313,7 +313,7 @@ void ProductQuantizer::decode(const uint8_t* code, float* x) const {`
`313`	`313`	`}`
`314`	`314`
`315`	`315`	`void ProductQuantizer::decode(const uint8_t* code, float* x, size_t n) const {`
`316`		`-#pragma omp parallel for if (n > 100)`
	`316`	`+#pragma omp parallel for if (n > 100) num_threads(num_omp_threads)`
`317`	`317`	`for (int64_t i = 0; i < n; i++) {`
`318`	`318`	`this->decode(code + code_size * i, x + d * i);`
`319`	`319`	`}`
Original file line number	Diff line number	Diff line change
`@@ -146,7 +146,7 @@ void exhaustive_inner_product_seq(`
`146`	`146`
`147`	`147`	`FAISS_ASSERT(use_sel == (sel != nullptr));`
`148`	`148`
`149`		`-#pragma omp parallel num_threads(nt)`
	`149`	`+#pragma omp parallel num_threads(num_omp_threads)`
`150`	`150`	`{`
`151`	`151`	`SingleResultHandler resi(res);`
`152`	`152`	`#pragma omp for`
`@@ -183,7 +183,7 @@ void exhaustive_L2sqr_seq(`
`183`	`183`
`184`	`184`	`FAISS_ASSERT(use_sel == (sel != nullptr));`
`185`	`185`
`186`		`-#pragma omp parallel num_threads(nt)`
	`186`	`+#pragma omp parallel num_threads(num_omp_threads)`
`187`	`187`	`{`
`188`	`188`	`SingleResultHandler resi(res);`
`189`	`189`	`#pragma omp for`