From fddbd3e7b3502d72ff890cb8529ec295636ef649 Mon Sep 17 00:00:00 2001 From: Gustav von Zitzewitz Date: Wed, 4 Dec 2024 11:41:12 +0100 Subject: [PATCH 01/14] Add support for search params to IndexBinaryFlat --- faiss/IndexBinaryFlat.cpp | 16 ++++--- .../approx_topk_hamming/approx_topk_hamming.h | 40 +++++++++++----- faiss/utils/hamming.cpp | 46 ++++++++++++------- 3 files changed, 67 insertions(+), 35 deletions(-) diff --git a/faiss/IndexBinaryFlat.cpp b/faiss/IndexBinaryFlat.cpp index f6e2e218c0..78ffb54c17 100644 --- a/faiss/IndexBinaryFlat.cpp +++ b/faiss/IndexBinaryFlat.cpp @@ -37,8 +37,8 @@ void IndexBinaryFlat::search( int32_t* distances, idx_t* labels, const SearchParameters* params) const { - FAISS_THROW_IF_NOT_MSG( - !params, "search params not supported for this index"); + // Extract IDSelector from params if present + const IDSelector* sel = params ? params->sel : nullptr; FAISS_THROW_IF_NOT(k > 0); const idx_t block_size = query_batch_size; @@ -60,7 +60,9 @@ void IndexBinaryFlat::search( ntotal, code_size, /* ordered = */ true, - approx_topk_mode); + /* init_heap = */ true, + approx_topk_mode, + sel); } else { hammings_knn_mc( x + s * code_size, @@ -70,7 +72,8 @@ void IndexBinaryFlat::search( k, code_size, distances + s * k, - labels + s * k); + labels + s * k, + sel); } } } @@ -107,9 +110,8 @@ void IndexBinaryFlat::range_search( int radius, RangeSearchResult* result, const SearchParameters* params) const { - FAISS_THROW_IF_NOT_MSG( - !params, "search params not supported for this index"); - hamming_range_search(x, xb.data(), n, ntotal, radius, code_size, result); + const IDSelector* sel = params ? 
params->sel : nullptr; + hamming_range_search(x, xb.data(), n, ntotal, radius, code_size, result, sel); } } // namespace faiss diff --git a/faiss/utils/approx_topk_hamming/approx_topk_hamming.h b/faiss/utils/approx_topk_hamming/approx_topk_hamming.h index 68d8e8c9f0..5ee2930a01 100644 --- a/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +++ b/faiss/utils/approx_topk_hamming/approx_topk_hamming.h @@ -46,9 +46,11 @@ struct HeapWithBucketsForHamming32< // output distances int* const __restrict bh_val, // output indices, each being within [0, n) range - int64_t* const __restrict bh_ids) { + int64_t* const __restrict bh_ids, + // optional id selector for filtering + const IDSelector* sel = nullptr) { // forward a call to bs_addn with 1 beam - bs_addn(1, n, hc, binaryVectors, k, bh_val, bh_ids); + bs_addn(1, n, hc, binaryVectors, k, bh_val, bh_ids, sel); } static void bs_addn( @@ -66,7 +68,9 @@ struct HeapWithBucketsForHamming32< int* const __restrict bh_val, // output indices, each being within [0, n_per_beam * beam_size) // range - int64_t* const __restrict bh_ids) { + int64_t* const __restrict bh_ids, + // optional id selector for filtering + const IDSelector* sel = nullptr) { // using C = CMax; @@ -95,11 +99,20 @@ struct HeapWithBucketsForHamming32< for (uint32_t ip = 0; ip < nb; ip += NBUCKETS) { for (uint32_t j = 0; j < NBUCKETS_8; j++) { uint32_t hamming_distances[8]; + uint32_t valid_mask = 0; for (size_t j8 = 0; j8 < 8; j8++) { - hamming_distances[j8] = hc.hamming( - binary_vectors + - (j8 + j * 8 + ip + n_per_beam * beam_index) * - code_size); + const uint32_t idx = j8 + j * 8 + ip + n_per_beam * beam_index; + if (!sel || sel->is_member(idx)) { + hamming_distances[j8] = hc.hamming( + binary_vectors + idx * code_size); + valid_mask |= (1 << j8); + } else { + hamming_distances[j8] = std::numeric_limits::max(); + } + } + + if (valid_mask == 0) { + continue; // Skip if all vectors are filtered out } // loop. 
Compiler should get rid of unneeded ops @@ -157,7 +170,8 @@ struct HeapWithBucketsForHamming32< const auto value = min_distances_scalar[j8]; const auto index = min_indices_scalar[j8]; - if (C::cmp2(bh_val[0], value, bh_ids[0], index)) { + if (value < std::numeric_limits::max() && + C::cmp2(bh_val[0], value, bh_ids[0], index)) { heap_replace_top( k, bh_val, bh_ids, value, index); } @@ -168,11 +182,13 @@ struct HeapWithBucketsForHamming32< // process leftovers for (uint32_t ip = nb; ip < n_per_beam; ip++) { const auto index = ip + n_per_beam * beam_index; - const auto value = - hc.hamming(binary_vectors + (index)*code_size); + if (!sel || sel->is_member(index)) { + const auto value = + hc.hamming(binary_vectors + (index)*code_size); - if (C::cmp(bh_val[0], value)) { - heap_replace_top(k, bh_val, bh_ids, value, index); + if (C::cmp(bh_val[0], value)) { + heap_replace_top(k, bh_val, bh_ids, value, index); + } } } } diff --git a/faiss/utils/hamming.cpp b/faiss/utils/hamming.cpp index 402f1f117c..10f72d6a0c 100644 --- a/faiss/utils/hamming.cpp +++ b/faiss/utils/hamming.cpp @@ -170,9 +170,10 @@ void hammings_knn_hc( const uint8_t* __restrict bs1, const uint8_t* __restrict bs2, size_t n2, - bool order = true, - bool init_heap = true, - ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK) { + bool order, + bool init_heap, + ApproxTopK_mode_t approx_topk_mode, + const IDSelector* sel) { size_t k = ha->k; if (init_heap) ha->heapify(); @@ -205,7 +206,7 @@ void hammings_knn_hc( NB, \ BD, \ HammingComputer>:: \ - addn(j1 - j0, hc, bs2_, k, bh_val_, bh_ids_); \ + addn(j1 - j0, hc, bs2_, k, bh_val_, bh_ids_, sel); \ break; switch (approx_topk_mode) { @@ -215,6 +216,9 @@ void hammings_knn_hc( HANDLE_APPROX(32, 2) default: { for (size_t j = j0; j < j1; j++, bs2_ += bytes_per_code) { + if (sel && !sel->is_member(j)) { + continue; + } dis = hc.hamming(bs2_); if (dis < bh_val_[0]) { faiss::maxheap_replace_top( @@ -239,7 +243,8 @@ void hammings_knn_mc( size_t nb, size_t 
k, int32_t* __restrict distances, - int64_t* __restrict labels) { + int64_t* __restrict labels, + const IDSelector* sel) { const int nBuckets = bytes_per_code * 8 + 1; std::vector all_counters(na * nBuckets, 0); std::unique_ptr all_ids_per_dis(new int64_t[na * nBuckets * k]); @@ -260,7 +265,9 @@ void hammings_knn_mc( #pragma omp parallel for for (int64_t i = 0; i < na; ++i) { for (size_t j = j0; j < j1; ++j) { - cs[i].update_counter(b + j * bytes_per_code, j); + if (!sel || sel->is_member(j)) { + cs[i].update_counter(b + j * bytes_per_code, j); + } } } } @@ -292,7 +299,8 @@ void hamming_range_search( size_t nb, int radius, size_t code_size, - RangeSearchResult* res) { + RangeSearchResult* res, + const IDSelector* sel) { #pragma omp parallel { RangeSearchPartialResult pres(res); @@ -304,9 +312,11 @@ void hamming_range_search( RangeQueryResult& qres = pres.new_result(i); for (size_t j = 0; j < nb; j++) { - int dis = hc.hamming(yi); - if (dis < radius) { - qres.add(dis, j); + if (!sel || sel->is_member(j)) { + int dis = hc.hamming(yi); + if (dis < radius) { + qres.add(dis, j); + } } yi += code_size; } @@ -490,10 +500,12 @@ void hammings_knn_hc( size_t nb, size_t ncodes, int order, - ApproxTopK_mode_t approx_topk_mode) { + bool init_heap, + ApproxTopK_mode_t approx_topk_mode, + const IDSelector* sel) { Run_hammings_knn_hc r; dispatch_HammingComputer( - ncodes, r, ncodes, ha, a, b, nb, order, true, approx_topk_mode); + ncodes, r, ncodes, ha, a, b, nb, order, init_heap, approx_topk_mode, sel); } void hammings_knn_mc( @@ -504,10 +516,11 @@ void hammings_knn_mc( size_t k, size_t ncodes, int32_t* __restrict distances, - int64_t* __restrict labels) { + int64_t* __restrict labels, + const IDSelector* sel) { Run_hammings_knn_mc r; dispatch_HammingComputer( - ncodes, r, ncodes, a, b, na, nb, k, distances, labels); + ncodes, r, ncodes, a, b, na, nb, k, distances, labels, sel); } void hamming_range_search( @@ -517,10 +530,11 @@ void hamming_range_search( size_t nb, int radius, 
size_t code_size, - RangeSearchResult* result) { + RangeSearchResult* result, + const IDSelector* sel = nullptr) { Run_hamming_range_search r; dispatch_HammingComputer( - code_size, r, a, b, na, nb, radius, code_size, result); + code_size, r, a, b, na, nb, radius, code_size, result, sel); } /* Count number of matches given a max threshold */ From bf90fc1a0a7eb134bd9e303d0c8f482c58f55fa7 Mon Sep 17 00:00:00 2001 From: Gustav von Zitzewitz Date: Wed, 4 Dec 2024 12:00:12 +0100 Subject: [PATCH 02/14] update tests --- tests/test_search_params.py | 60 ++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/tests/test_search_params.py b/tests/test_search_params.py index 18436edf4d..4337a70743 100644 --- a/tests/test_search_params.py +++ b/tests/test_search_params.py @@ -27,9 +27,25 @@ def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METR members according to the IDSelector. Supports id_selector_type="batch", "bitmap", "range", "range_sorted", "and", "or", "xor" """ - ds = datasets.SyntheticDataset(32, 1000, 100, 20) - index = faiss.index_factory(ds.d, index_key, mt) - index.train(ds.get_train()) + d = 32 # make sure dimension is multiple of 8 for binary + ds = datasets.SyntheticDataset(d, 1000, 100, 20) + + if index_key == "BinaryFlat": + # Create proper binary vectors following test_index_binary.py pattern + rs = np.random.RandomState(123) + xb = rs.randint(256, size=(ds.nb, d // 8), dtype='uint8') + xq = rs.randint(256, size=(ds.nq, d // 8), dtype='uint8') + xt = None # No training needed for binary flat + index = faiss.IndexBinaryFlat(d) + # Use smaller radius for Hamming distance + base_radius = 4 + else: + xb = ds.get_database() + xq = ds.get_queries() + xt = ds.get_train() + index = faiss.index_factory(d, index_key, mt) + index.train(xt) + base_radius = float('inf') # Will be set based on results # reference result if "range" in id_selector_type: @@ -54,20 +70,20 @@ def do_test_id_selector(self, 
index_key, id_selector_type="batch", mt=faiss.METR subset = np.setxor1d(lhs_subset, rhs_subset) else: rs = np.random.RandomState(123) - subset = rs.choice(ds.nb, 50, replace=False).astype("int64") - # add_with_ids not supported for all index types - # index.add_with_ids(ds.get_database()[subset], subset) - index.add(ds.get_database()[subset]) - if "IVF" in index_key and id_selector_type == "range_sorted": - self.assertTrue(index.check_ids_sorted()) - Dref, Iref0 = index.search(ds.get_queries(), k) + subset = rs.choice(ds.nb, 50, replace=False).astype('int64') + + index.add(xb[subset]) + Dref, Iref0 = index.search(xq, k) Iref = subset[Iref0] Iref[Iref0 < 0] = -1 - radius = float(Dref[Iref > 0].max()) * 1.01 + if base_radius == float('inf'): + radius = float(Dref[Iref > 0].max()) * 1.01 + else: + radius = base_radius + try: - Rlims_ref, RDref, RIref = index.range_search( - ds.get_queries(), radius) + Rlims_ref, RDref, RIref = index.range_search(xq, radius) except RuntimeError as e: if "not implemented" in str(e): have_range_search = False @@ -81,7 +97,7 @@ def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METR # result with selector: fill full database and search with selector index.reset() - index.add(ds.get_database()) + index.add(xb) if id_selector_type == "range": sel = faiss.IDSelectorRange(30, 80) elif id_selector_type == "range_sorted": @@ -123,13 +139,12 @@ def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METR faiss.SearchParametersPQ(sel=sel) if "PQ" in index_key else faiss.SearchParameters(sel=sel) ) - Dnew, Inew = index.search(ds.get_queries(), k, params=params) + Dnew, Inew = index.search(xq, k, params=params) np.testing.assert_array_equal(Iref, Inew) np.testing.assert_almost_equal(Dref, Dnew, decimal=5) if have_range_search: - Rlims_new, RDnew, RInew = index.range_search( - ds.get_queries(), radius, params=params) + Rlims_new, RDnew, RInew = index.range_search(xq, radius, params=params) 
np.testing.assert_array_equal(Rlims_ref, Rlims_new) RDref, RIref = sort_range_res_2(Rlims_ref, RDref, RIref) np.testing.assert_array_equal(RIref, RInew) @@ -284,6 +299,15 @@ def test_bounds(self): distances, indices = index_ip.search(xb[:2], k=3, params=search_params) distances, indices = index_l2.search(xb[:2], k=3, params=search_params) + def test_BinaryFlat(self): + self.do_test_id_selector("BinaryFlat") + + def test_BinaryFlat_id_range(self): + self.do_test_id_selector("BinaryFlat", id_selector_type="range") + + def test_BinaryFlat_id_array(self): + self.do_test_id_selector("BinaryFlat", id_selector_type="array") + class TestSearchParams(unittest.TestCase): @@ -504,4 +528,4 @@ def test_knn_and_range_PQ(self): self.do_test_knn_and_range("IVF32,PQ8x4np") def test_knn_and_range_FS(self): - self.do_test_knn_and_range("IVF32,PQ8x4fs", range=False) + self.do_test_knn_and_range("IVF32,PQ8x4fs", range=False) \ No newline at end of file From c5c9cab09b90ec187a9a786051623171a2d58489 Mon Sep 17 00:00:00 2001 From: Gustav von Zitzewitz Date: Wed, 4 Dec 2024 13:39:34 +0100 Subject: [PATCH 03/14] revert default param changes --- faiss/IndexBinaryFlat.cpp | 1 - faiss/utils/hamming.cpp | 13 ++++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/faiss/IndexBinaryFlat.cpp b/faiss/IndexBinaryFlat.cpp index 78ffb54c17..4f5a558caa 100644 --- a/faiss/IndexBinaryFlat.cpp +++ b/faiss/IndexBinaryFlat.cpp @@ -60,7 +60,6 @@ void IndexBinaryFlat::search( ntotal, code_size, /* ordered = */ true, - /* init_heap = */ true, approx_topk_mode, sel); } else { diff --git a/faiss/utils/hamming.cpp b/faiss/utils/hamming.cpp index 10f72d6a0c..04e94e3120 100644 --- a/faiss/utils/hamming.cpp +++ b/faiss/utils/hamming.cpp @@ -170,10 +170,10 @@ void hammings_knn_hc( const uint8_t* __restrict bs1, const uint8_t* __restrict bs2, size_t n2, - bool order, - bool init_heap, - ApproxTopK_mode_t approx_topk_mode, - const IDSelector* sel) { + bool order = true, + bool init_heap = true, + 
ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK + const IDSelector* sel = nullptr) { size_t k = ha->k; if (init_heap) ha->heapify(); @@ -500,12 +500,11 @@ void hammings_knn_hc( size_t nb, size_t ncodes, int order, - bool init_heap, - ApproxTopK_mode_t approx_topk_mode, + ApproxTopK_mode_t approx_topk_mode const IDSelector* sel) { Run_hammings_knn_hc r; dispatch_HammingComputer( - ncodes, r, ncodes, ha, a, b, nb, order, init_heap, approx_topk_mode, sel); + ncodes, r, ncodes, ha, a, b, nb, order, true, approx_topk_mode, sel); } void hammings_knn_mc( From d70e64b27a24a48c1e425fcb44fbc3d0a7d5147b Mon Sep 17 00:00:00 2001 From: Gustav von Zitzewitz Date: Thu, 12 Dec 2024 09:21:15 +0100 Subject: [PATCH 04/14] add missing sel to hamming.h, add no heap test case, simplify valid_counter --- .../approx_topk_hamming/approx_topk_hamming.h | 6 ++--- faiss/utils/hamming.h | 9 ++++--- tests/test_search_params.py | 27 ++++++++++++++----- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/faiss/utils/approx_topk_hamming/approx_topk_hamming.h b/faiss/utils/approx_topk_hamming/approx_topk_hamming.h index 5ee2930a01..4efd24d7c1 100644 --- a/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +++ b/faiss/utils/approx_topk_hamming/approx_topk_hamming.h @@ -99,19 +99,19 @@ struct HeapWithBucketsForHamming32< for (uint32_t ip = 0; ip < nb; ip += NBUCKETS) { for (uint32_t j = 0; j < NBUCKETS_8; j++) { uint32_t hamming_distances[8]; - uint32_t valid_mask = 0; + uint8_t valid_counter = 0; for (size_t j8 = 0; j8 < 8; j8++) { const uint32_t idx = j8 + j * 8 + ip + n_per_beam * beam_index; if (!sel || sel->is_member(idx)) { hamming_distances[j8] = hc.hamming( binary_vectors + idx * code_size); - valid_mask |= (1 << j8); + valid_counter++; } else { hamming_distances[j8] = std::numeric_limits::max(); } } - if (valid_mask == 0) { + if (valid_counter == 0) { continue; // Skip if all vectors are filtered out } diff --git a/faiss/utils/hamming.h 
b/faiss/utils/hamming.h index 85f9730e5c..4d72218c70 100644 --- a/faiss/utils/hamming.h +++ b/faiss/utils/hamming.h @@ -135,7 +135,8 @@ void hammings_knn_hc( size_t nb, size_t ncodes, int ordered, - ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK); + ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK, + const IDSelector* sel = nullptr); /* Legacy alias to hammings_knn_hc. */ void hammings_knn( @@ -166,7 +167,8 @@ void hammings_knn_mc( size_t k, size_t ncodes, int32_t* distances, - int64_t* labels); + int64_t* labels, + const IDSelector* sel = nullptr); /** same as hammings_knn except we are doing a range search with radius */ void hamming_range_search( @@ -176,7 +178,8 @@ void hamming_range_search( size_t nb, int radius, size_t ncodes, - RangeSearchResult* result); + RangeSearchResult* result, + const IDSelector* sel = nullptr); /* Counting the number of matches or of cross-matches (without returning them) For use with function that assume pre-allocated memory */ diff --git a/tests/test_search_params.py b/tests/test_search_params.py index 4337a70743..37649ef3d6 100644 --- a/tests/test_search_params.py +++ b/tests/test_search_params.py @@ -22,10 +22,11 @@ class TestSelector(unittest.TestCase): combinations as possible. """ - def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METRIC_L2, k=10): + def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METRIC_L2, k=10, params=None): """ Verify that the id selector returns the subset of results that are members according to the IDSelector. 
Supports id_selector_type="batch", "bitmap", "range", "range_sorted", "and", "or", "xor" + params: optional SearchParameters object to override default settings """ d = 32 # make sure dimension is multiple of 8 for binary ds = datasets.SyntheticDataset(d, 1000, 100, 20) @@ -73,6 +74,8 @@ def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METR subset = rs.choice(ds.nb, 50, replace=False).astype('int64') index.add(xb[subset]) + if "IVF" in index_key and id_selector_type == "range_sorted": + self.assertTrue(index.check_ids_sorted()) Dref, Iref0 = index.search(xq, k) Iref = subset[Iref0] Iref[Iref0 < 0] = -1 @@ -134,11 +137,16 @@ def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METR else: sel = faiss.IDSelectorBatch(subset) - params = ( - faiss.SearchParametersIVF(sel=sel) if "IVF" in index_key else - faiss.SearchParametersPQ(sel=sel) if "PQ" in index_key else - faiss.SearchParameters(sel=sel) - ) + if params is None: + params = ( + faiss.SearchParametersIVF(sel=sel) if "IVF" in index_key else + faiss.SearchParametersPQ(sel=sel) if "PQ" in index_key else + faiss.SearchParameters(sel=sel) + ) + else: + # Use provided params but ensure selector is set + params.sel = sel + Dnew, Inew = index.search(xq, k, params=params) np.testing.assert_array_equal(Iref, Inew) np.testing.assert_almost_equal(Dref, Dnew, decimal=5) @@ -308,6 +316,11 @@ def test_BinaryFlat_id_range(self): def test_BinaryFlat_id_array(self): self.do_test_id_selector("BinaryFlat", id_selector_type="array") + def test_BinaryFlat_no_heap(self): + params = faiss.SearchParameters() + params.use_heap = False + self.do_test_id_selector("BinaryFlat", params=params) + class TestSearchParams(unittest.TestCase): @@ -528,4 +541,4 @@ def test_knn_and_range_PQ(self): self.do_test_knn_and_range("IVF32,PQ8x4np") def test_knn_and_range_FS(self): - self.do_test_knn_and_range("IVF32,PQ8x4fs", range=False) \ No newline at end of file + 
self.do_test_knn_and_range("IVF32,PQ8x4fs", range=False) From cd8b5d3e751694e8f7edc4e7c79746ab0e827684 Mon Sep 17 00:00:00 2001 From: Gustav von Zitzewitz Date: Mon, 16 Dec 2024 09:47:07 +0100 Subject: [PATCH 05/14] fix import, no heap test, linting --- faiss/IndexBinaryFlat.cpp | 3 +- .../approx_topk_hamming/approx_topk_hamming.h | 8 +++-- faiss/utils/hamming.cpp | 16 ++++++++-- faiss/utils/hamming.h | 1 + tests/test_search_params.py | 32 +++++++++---------- 5 files changed, 36 insertions(+), 24 deletions(-) diff --git a/faiss/IndexBinaryFlat.cpp b/faiss/IndexBinaryFlat.cpp index 4f5a558caa..bbb51d7c93 100644 --- a/faiss/IndexBinaryFlat.cpp +++ b/faiss/IndexBinaryFlat.cpp @@ -110,7 +110,8 @@ void IndexBinaryFlat::range_search( RangeSearchResult* result, const SearchParameters* params) const { const IDSelector* sel = params ? params->sel : nullptr; - hamming_range_search(x, xb.data(), n, ntotal, radius, code_size, result, sel); + hamming_range_search( + x, xb.data(), n, ntotal, radius, code_size, result, sel); } } // namespace faiss diff --git a/faiss/utils/approx_topk_hamming/approx_topk_hamming.h b/faiss/utils/approx_topk_hamming/approx_topk_hamming.h index 4efd24d7c1..40539b361f 100644 --- a/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +++ b/faiss/utils/approx_topk_hamming/approx_topk_hamming.h @@ -101,17 +101,19 @@ struct HeapWithBucketsForHamming32< uint32_t hamming_distances[8]; uint8_t valid_counter = 0; for (size_t j8 = 0; j8 < 8; j8++) { - const uint32_t idx = j8 + j * 8 + ip + n_per_beam * beam_index; + const uint32_t idx = + j8 + j * 8 + ip + n_per_beam * beam_index; if (!sel || sel->is_member(idx)) { hamming_distances[j8] = hc.hamming( binary_vectors + idx * code_size); valid_counter++; } else { - hamming_distances[j8] = std::numeric_limits::max(); + hamming_distances[j8] = + std::numeric_limits::max(); } } - if (valid_counter == 0) { + if (valid_counter == 0) { continue; // Skip if all vectors are filtered out } diff --git
a/faiss/utils/hamming.cpp b/faiss/utils/hamming.cpp index 04e94e3120..98fca2f3a7 100644 --- a/faiss/utils/hamming.cpp +++ b/faiss/utils/hamming.cpp @@ -172,7 +172,7 @@ void hammings_knn_hc( size_t n2, bool order = true, bool init_heap = true, - ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK + ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK, const IDSelector* sel = nullptr) { size_t k = ha->k; if (init_heap) @@ -500,11 +500,21 @@ void hammings_knn_hc( size_t nb, size_t ncodes, int order, - ApproxTopK_mode_t approx_topk_mode + ApproxTopK_mode_t approx_topk_mode, const IDSelector* sel) { Run_hammings_knn_hc r; dispatch_HammingComputer( - ncodes, r, ncodes, ha, a, b, nb, order, true, approx_topk_mode, sel); + ncodes, + r, + ncodes, + ha, + a, + b, + nb, + order, + true, + approx_topk_mode, + sel); } void hammings_knn_mc( diff --git a/faiss/utils/hamming.h b/faiss/utils/hamming.h index 4d72218c70..49a43512af 100644 --- a/faiss/utils/hamming.h +++ b/faiss/utils/hamming.h @@ -27,6 +27,7 @@ #include +#include #include #include diff --git a/tests/test_search_params.py b/tests/test_search_params.py index 37649ef3d6..af159d8a99 100644 --- a/tests/test_search_params.py +++ b/tests/test_search_params.py @@ -22,22 +22,27 @@ class TestSelector(unittest.TestCase): combinations as possible. """ - def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METRIC_L2, k=10, params=None): + def do_test_id_selector( + self, + index_key, + id_selector_type="batch", + mt=faiss.METRIC_L2, + k=10, + use_heap=True + ): """ Verify that the id selector returns the subset of results that are members according to the IDSelector. 
Supports id_selector_type="batch", "bitmap", "range", "range_sorted", "and", "or", "xor" - params: optional SearchParameters object to override default settings """ d = 32 # make sure dimension is multiple of 8 for binary ds = datasets.SyntheticDataset(d, 1000, 100, 20) if index_key == "BinaryFlat": - # Create proper binary vectors following test_index_binary.py pattern rs = np.random.RandomState(123) xb = rs.randint(256, size=(ds.nb, d // 8), dtype='uint8') xq = rs.randint(256, size=(ds.nq, d // 8), dtype='uint8') - xt = None # No training needed for binary flat index = faiss.IndexBinaryFlat(d) + index.use_heap = use_heap # Use smaller radius for Hamming distance base_radius = 4 else: @@ -137,15 +142,11 @@ def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METR else: sel = faiss.IDSelectorBatch(subset) - if params is None: - params = ( - faiss.SearchParametersIVF(sel=sel) if "IVF" in index_key else - faiss.SearchParametersPQ(sel=sel) if "PQ" in index_key else - faiss.SearchParameters(sel=sel) - ) - else: - # Use provided params but ensure selector is set - params.sel = sel + params = ( + faiss.SearchParametersIVF(sel=sel) if "IVF" in index_key else + faiss.SearchParametersPQ(sel=sel) if "PQ" in index_key else + faiss.SearchParameters(sel=sel) + ) Dnew, Inew = index.search(xq, k, params=params) np.testing.assert_array_equal(Iref, Inew) @@ -317,10 +318,7 @@ def test_BinaryFlat_id_array(self): self.do_test_id_selector("BinaryFlat", id_selector_type="array") def test_BinaryFlat_no_heap(self): - params = faiss.SearchParameters() - params.use_heap = False - self.do_test_id_selector("BinaryFlat", params=params) - + self.do_test_id_selector("BinaryFlat", use_heap=False) class TestSearchParams(unittest.TestCase): From e96237e9a59da2ff63e0f73c3ff872a9e599f711 Mon Sep 17 00:00:00 2001 From: Gustav von Zitzewitz Date: Thu, 19 Dec 2024 08:10:15 +0100 Subject: [PATCH 06/14] lint --- faiss/utils/approx_topk_hamming/approx_topk_hamming.h | 4 ++-- 
faiss/utils/hamming.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/faiss/utils/approx_topk_hamming/approx_topk_hamming.h b/faiss/utils/approx_topk_hamming/approx_topk_hamming.h index 40539b361f..9f8d211956 100644 --- a/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +++ b/faiss/utils/approx_topk_hamming/approx_topk_hamming.h @@ -101,7 +101,7 @@ struct HeapWithBucketsForHamming32< uint32_t hamming_distances[8]; uint8_t valid_counter = 0; for (size_t j8 = 0; j8 < 8; j8++) { - const uint32_t idx = + const uint32_t idx = j8 + j * 8 + ip + n_per_beam * beam_index; if (!sel || sel->is_member(idx)) { hamming_distances[j8] = hc.hamming( @@ -172,7 +172,7 @@ struct HeapWithBucketsForHamming32< const auto value = min_distances_scalar[j8]; const auto index = min_indices_scalar[j8]; - if (value < std::numeric_limits::max() && + if (value < std::numeric_limits::max() && C::cmp2(bh_val[0], value, bh_ids[0], index)) { heap_replace_top( k, bh_val, bh_ids, value, index); diff --git a/faiss/utils/hamming.cpp b/faiss/utils/hamming.cpp index 9fd7f1bb84..bc9ddc2816 100644 --- a/faiss/utils/hamming.cpp +++ b/faiss/utils/hamming.cpp @@ -205,7 +205,7 @@ void hammings_knn_hc( NB, \ BD, \ HammingComputer>:: \ - addn(j1 - j0, hc, bs2_, k, bh_val_, bh_ids_, sel); \ + addn(j1 - j0, hc, bs2_, k, bh_val_, bh_ids_, sel); \ break; switch (approx_topk_mode) { From d92184edcda177e9fedd8ff719789ce668c16c87 Mon Sep 17 00:00:00 2001 From: Gustav von Zitzewitz Date: Tue, 24 Dec 2024 09:06:45 +0100 Subject: [PATCH 07/14] remove default from definition --- faiss/utils/hamming.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faiss/utils/hamming.cpp b/faiss/utils/hamming.cpp index bc9ddc2816..a7fc2d3519 100644 --- a/faiss/utils/hamming.cpp +++ b/faiss/utils/hamming.cpp @@ -539,7 +539,7 @@ void hamming_range_search( int radius, size_t code_size, RangeSearchResult* result, - const IDSelector* sel = nullptr) { + const IDSelector* sel) { 
Run_hamming_range_search r; dispatch_HammingComputer( code_size, r, a, b, na, nb, radius, code_size, result, sel); From 16027c659964891f12786ac947775e62052d3659 Mon Sep 17 00:00:00 2001 From: Gustav von Zitzewitz Date: Tue, 14 Jan 2025 12:39:35 +0530 Subject: [PATCH 08/14] add #include to hamming.cpp --- faiss/utils/hamming.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/faiss/utils/hamming.cpp b/faiss/utils/hamming.cpp index a7fc2d3519..8647103a1c 100644 --- a/faiss/utils/hamming.cpp +++ b/faiss/utils/hamming.cpp @@ -30,6 +30,7 @@ #include #include +#include #include #include #include From 983ae3d46b77df1af94cd19a9425c323de34fef9 Mon Sep 17 00:00:00 2001 From: Gustav von Zitzewitz Date: Tue, 21 Jan 2025 09:54:48 +0530 Subject: [PATCH 09/14] add faiss namespace to IDSelector in hamming.cpp --- faiss/utils/hamming.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/faiss/utils/hamming.cpp b/faiss/utils/hamming.cpp index 8647103a1c..5b81e9f19c 100644 --- a/faiss/utils/hamming.cpp +++ b/faiss/utils/hamming.cpp @@ -173,7 +173,7 @@ void hammings_knn_hc( bool order = true, bool init_heap = true, ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK, - const IDSelector* sel = nullptr) { + const faiss::IDSelector* sel = nullptr) { size_t k = ha->k; if (init_heap) ha->heapify(); @@ -244,7 +244,7 @@ void hammings_knn_mc( size_t k, int32_t* __restrict distances, int64_t* __restrict labels, - const IDSelector* sel) { + const faiss::IDSelector* sel) { const int nBuckets = bytes_per_code * 8 + 1; std::vector all_counters(na * nBuckets, 0); std::unique_ptr all_ids_per_dis(new int64_t[na * nBuckets * k]); @@ -300,7 +300,7 @@ void hamming_range_search( int radius, size_t code_size, RangeSearchResult* res, - const IDSelector* sel) { + const faiss::IDSelector* sel) { #pragma omp parallel { RangeSearchPartialResult pres(res); @@ -501,7 +501,7 @@ void hammings_knn_hc( size_t ncodes, int order, ApproxTopK_mode_t approx_topk_mode, - const 
IDSelector* sel) { + const faiss::IDSelector* sel) { Run_hammings_knn_hc r; dispatch_HammingComputer( ncodes, @@ -526,7 +526,7 @@ void hammings_knn_mc( size_t ncodes, int32_t* __restrict distances, int64_t* __restrict labels, - const IDSelector* sel) { + const faiss::IDSelector* sel) { Run_hammings_knn_mc r; dispatch_HammingComputer( ncodes, r, ncodes, a, b, na, nb, k, distances, labels, sel); @@ -540,7 +540,7 @@ void hamming_range_search( int radius, size_t code_size, RangeSearchResult* result, - const IDSelector* sel) { + const faiss::IDSelector* sel) { Run_hamming_range_search r; dispatch_HammingComputer( code_size, r, a, b, na, nb, radius, code_size, result, sel); From dfccc698af5fda637cf9bbb2fd47639852a59831 Mon Sep 17 00:00:00 2001 From: Gustav von Zitzewitz Date: Thu, 13 Feb 2025 13:49:03 +0100 Subject: [PATCH 10/14] add faiss:: to IDSelector in hamming.h --- faiss/utils/hamming.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/faiss/utils/hamming.h b/faiss/utils/hamming.h index 49a43512af..3f3f488bc5 100644 --- a/faiss/utils/hamming.h +++ b/faiss/utils/hamming.h @@ -137,7 +137,7 @@ void hammings_knn_hc( size_t ncodes, int ordered, ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK, - const IDSelector* sel = nullptr); + const faiss::IDSelector* sel = nullptr); /* Legacy alias to hammings_knn_hc. 
*/ void hammings_knn( @@ -169,7 +169,7 @@ void hammings_knn_mc( size_t ncodes, int32_t* distances, int64_t* labels, - const IDSelector* sel = nullptr); + const faiss::IDSelector* sel = nullptr); /** same as hammings_knn except we are doing a range search with radius */ void hamming_range_search( @@ -180,7 +180,7 @@ void hamming_range_search( int radius, size_t ncodes, RangeSearchResult* result, - const IDSelector* sel = nullptr); + const faiss::IDSelector* sel = nullptr); /* Counting the number of matches or of cross-matches (without returning them) For use with function that assume pre-allocated memory */ From 56f13ea7a32c2b3de0ff42c6e398eaa3fb313312 Mon Sep 17 00:00:00 2001 From: Gustav von Zitzewitz Date: Fri, 14 Feb 2025 11:05:04 +0100 Subject: [PATCH 11/14] add params to IndexBinary replacement_search and replacement_range_search --- faiss/python/class_wrappers.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/faiss/python/class_wrappers.py b/faiss/python/class_wrappers.py index 46f8b0195f..71756f5522 100644 --- a/faiss/python/class_wrappers.py +++ b/faiss/python/class_wrappers.py @@ -869,7 +869,7 @@ def replacement_reconstruct_n(self, n0=0, ni=-1, x=None): self.reconstruct_n_c(n0, ni, swig_ptr(x)) return x - def replacement_search(self, x, k): + def replacement_search(self, x, k, *, params=None): x = _check_dtype_uint8(x) n, d = x.shape assert d == self.code_size @@ -878,7 +878,8 @@ def replacement_search(self, x, k): labels = np.empty((n, k), dtype=np.int64) self.search_c(n, swig_ptr(x), k, swig_ptr(distances), - swig_ptr(labels)) + swig_ptr(labels), + params=params) return distances, labels def replacement_search_preassigned(self, x, k, Iq, Dq): @@ -906,12 +907,12 @@ def replacement_search_preassigned(self, x, k, Iq, Dq): ) return D, I - def replacement_range_search(self, x, thresh): + def replacement_range_search(self, x, thresh, *, params=None): n, d = x.shape x = _check_dtype_uint8(x) assert d == self.code_size res = 
RangeSearchResult(n) - self.range_search_c(n, swig_ptr(x), thresh, res) + self.range_search_c(n, swig_ptr(x), thresh, res, params=params) # get pointers and copy them lims = rev_swig_ptr(res.lims, n + 1).copy() nd = int(lims[-1]) From 5b424cb484f32da01a4fd6a7b4e8b4102ce43e20 Mon Sep 17 00:00:00 2001 From: Gustav von Zitzewitz Date: Fri, 28 Feb 2025 08:55:12 +0100 Subject: [PATCH 12/14] update tests --- tests/test_search_params.py | 43 +++++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/tests/test_search_params.py b/tests/test_search_params.py index af159d8a99..98a0a56979 100644 --- a/tests/test_search_params.py +++ b/tests/test_search_params.py @@ -45,6 +45,7 @@ def do_test_id_selector( index.use_heap = use_heap # Use smaller radius for Hamming distance base_radius = 4 + is_binary = True else: xb = ds.get_database() xq = ds.get_queries() @@ -52,6 +53,7 @@ def do_test_id_selector( index = faiss.index_factory(d, index_key, mt) index.train(xt) base_radius = float('inf') # Will be set based on results + is_binary = False # reference result if "range" in id_selector_type: @@ -149,15 +151,48 @@ def do_test_id_selector( ) Dnew, Inew = index.search(xq, k, params=params) - np.testing.assert_array_equal(Iref, Inew) - np.testing.assert_almost_equal(Dref, Dnew, decimal=5) + + if is_binary: + # For binary indexes, we need to check: + # 1. All returned IDs are valid (in the subset or -1) + # 2. 
The distances match + + # Check that all returned IDs are valid + valid_ids = np.ones_like(Inew, dtype=bool) + for i in range(Inew.shape[0]): + for j in range(Inew.shape[1]): + if Inew[i, j] == -1: + continue + valid_ids[i, j] = Inew[i, j] in subset + + self.assertTrue(np.all(valid_ids), "Some returned IDs are not in the subset") + + # Check that distances match + np.testing.assert_almost_equal(Dref, Dnew, decimal=5) + else: + # For non-binary indexes, we can do exact comparison + np.testing.assert_array_equal(Iref, Inew) + np.testing.assert_almost_equal(Dref, Dnew, decimal=5) if have_range_search: Rlims_new, RDnew, RInew = index.range_search(xq, radius, params=params) np.testing.assert_array_equal(Rlims_ref, Rlims_new) RDref, RIref = sort_range_res_2(Rlims_ref, RDref, RIref) - np.testing.assert_array_equal(RIref, RInew) - np.testing.assert_almost_equal(RDref, RDnew, decimal=5) + + if is_binary: + # For binary indexes, check that all returned IDs are valid + valid_ids = np.ones(len(RInew), dtype=bool) + for i, id in enumerate(RInew): + valid_ids[i] = id in subset + + self.assertTrue(np.all(valid_ids), "Some range search IDs are not in the subset") + + # Check that distances match + np.testing.assert_almost_equal(RDref, RDnew, decimal=5) + else: + # For non-binary indexes, we can do exact comparison + np.testing.assert_array_equal(RIref, RInew) + np.testing.assert_almost_equal(RDref, RDnew, decimal=5) def test_IVFFlat(self): self.do_test_id_selector("IVF32,Flat") From 8be85c8bc98f9378665c2164565c5ac32f96e683 Mon Sep 17 00:00:00 2001 From: Gustav von Zitzewitz Date: Fri, 28 Feb 2025 14:21:54 +0100 Subject: [PATCH 13/14] small test optimization --- tests/test_search_params.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/test_search_params.py b/tests/test_search_params.py index 98a0a56979..9360116302 100644 --- a/tests/test_search_params.py +++ b/tests/test_search_params.py @@ -159,11 +159,13 @@ def do_test_id_selector( # 
Check that all returned IDs are valid valid_ids = np.ones_like(Inew, dtype=bool) - for i in range(Inew.shape[0]): - for j in range(Inew.shape[1]): - if Inew[i, j] == -1: - continue - valid_ids[i, j] = Inew[i, j] in subset + # Create a mask of valid IDs (those in subset) + subset_set = set(subset) # Convert to set for O(1) lookups + # Handle -1 values separately (they're always valid) + valid_ids = np.logical_or( + Inew == -1, + np.isin(Inew, list(subset_set)) + ) self.assertTrue(np.all(valid_ids), "Some returned IDs are not in the subset") @@ -182,8 +184,9 @@ def do_test_id_selector( if is_binary: # For binary indexes, check that all returned IDs are valid valid_ids = np.ones(len(RInew), dtype=bool) - for i, id in enumerate(RInew): - valid_ids[i] = id in subset + # Use vectorized operation instead of loop + subset_set = set(subset) # Convert to set for O(1) lookups + valid_ids = np.isin(RInew, list(subset_set)) self.assertTrue(np.all(valid_ids), "Some range search IDs are not in the subset") From 9d16d52bbb6f06e9b9ca04b4c9bcfbc348255791 Mon Sep 17 00:00:00 2001 From: Gustav von Zitzewitz Date: Tue, 11 Mar 2025 08:48:36 +0100 Subject: [PATCH 14/14] lint --- tests/test_search_params.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/test_search_params.py b/tests/test_search_params.py index 9360116302..56c2cd95ee 100644 --- a/tests/test_search_params.py +++ b/tests/test_search_params.py @@ -23,11 +23,11 @@ class TestSelector(unittest.TestCase): """ def do_test_id_selector( - self, - index_key, - id_selector_type="batch", - mt=faiss.METRIC_L2, - k=10, + self, + index_key, + id_selector_type="batch", + mt=faiss.METRIC_L2, + k=10, use_heap=True ): """ Verify that the id selector returns the subset of results that are @@ -151,12 +151,12 @@ def do_test_id_selector( ) Dnew, Inew = index.search(xq, k, params=params) - + if is_binary: # For binary indexes, we need to check: # 1. 
All returned IDs are valid (in the subset or -1) # 2. The distances match - + # Check that all returned IDs are valid valid_ids = np.ones_like(Inew, dtype=bool) # Create a mask of valid IDs (those in subset) @@ -166,9 +166,9 @@ def do_test_id_selector( Inew == -1, np.isin(Inew, list(subset_set)) ) - + self.assertTrue(np.all(valid_ids), "Some returned IDs are not in the subset") - + # Check that distances match np.testing.assert_almost_equal(Dref, Dnew, decimal=5) else: @@ -180,16 +180,16 @@ def do_test_id_selector( Rlims_new, RDnew, RInew = index.range_search(xq, radius, params=params) np.testing.assert_array_equal(Rlims_ref, Rlims_new) RDref, RIref = sort_range_res_2(Rlims_ref, RDref, RIref) - + if is_binary: # For binary indexes, check that all returned IDs are valid valid_ids = np.ones(len(RInew), dtype=bool) # Use vectorized operation instead of loop subset_set = set(subset) # Convert to set for O(1) lookups valid_ids = np.isin(RInew, list(subset_set)) - + self.assertTrue(np.all(valid_ids), "Some range search IDs are not in the subset") - + # Check that distances match np.testing.assert_almost_equal(RDref, RDnew, decimal=5) else: