Skip to content

Commit b741509

Browse files
kuarorafacebook-github-bot
authored andcommitted
Support of shift_ids in merge_from_multiple function of OnDiskInvertedLists (#3327)
Summary: **Context** 1. [Issue 2621](#2621) discusses an inconsistency between OnDiskInvertedLists and InvertedLists. OnDiskInvertedLists is supposed to handle multiple disk-based index shards, so the function that merges inverted lists from index shards should have a different name. 2. [Issue 2876](#2876) provides a use case for shifting ids when merging inverted lists from different shards. **In this diff** 1. To address item 1 above, I renamed the OnDiskInvertedLists merge_from function to merge_from_multiple while keeping the base-class merge_from. Why? So that merging the inverted lists of one index into the on-disk inverted lists of another index continues to work. 2. To address item 2 above, I added a shift_ids option to merge_from_multiple that shifts the ids coming from different shards. This is useful when each shard has the same set of ids but different data; it is not recommended if ids are already unique across shards. Differential Revision: D55482518
1 parent 03db694 commit b741509

File tree

4 files changed

+49
-8
lines changed

4 files changed

+49
-8
lines changed

faiss/invlists/InvertedLists.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ struct InvertedLists {
129129
* high level functions */
130130

131131
/// move all entries from oivf (empty on output)
132-
void merge_from(InvertedLists* oivf, size_t add_id);
132+
virtual void merge_from(InvertedLists* oivf, size_t add_id);
133133

134134
// how to copy a subset of elements from the inverted lists
135135
// This depends on two integers, a1 and a2.

faiss/invlists/OnDiskInvertedLists.cpp

+20-4
Original file line numberDiff line numberDiff line change
@@ -565,22 +565,28 @@ void OnDiskInvertedLists::free_slot(size_t offset, size_t capacity) {
565565
/*****************************************
566566
* Compact form
567567
*****************************************/
568-
569-
size_t OnDiskInvertedLists::merge_from(
568+
size_t OnDiskInvertedLists::merge_from_multiple(
570569
const InvertedLists** ils,
571570
int n_il,
571+
bool shift_ids,
572572
bool verbose) {
573573
FAISS_THROW_IF_NOT_MSG(
574574
totsize == 0, "works only on an empty InvertedLists");
575575

576576
std::vector<size_t> sizes(nlist);
577+
std::vector<size_t> shift_id_offsets(n_il);
577578
for (int i = 0; i < n_il; i++) {
578579
const InvertedLists* il = ils[i];
579580
FAISS_THROW_IF_NOT(il->nlist == nlist && il->code_size == code_size);
580581

582+
size_t il_totsize = 0;
581583
for (size_t j = 0; j < nlist; j++) {
582584
sizes[j] += il->list_size(j);
585+
il_totsize += il->list_size(j);
583586
}
587+
588+
shift_id_offsets[i] =
589+
(shift_ids && i > 0) ? shift_id_offsets[i - 1] + il_totsize : 0;
584590
}
585591

586592
size_t cums = 0;
@@ -605,11 +611,21 @@ size_t OnDiskInvertedLists::merge_from(
605611
const InvertedLists* il = ils[i];
606612
size_t n_entry = il->list_size(j);
607613
l.size += n_entry;
614+
ScopedIds scope_ids(il, j);
615+
const idx_t* scope_ids_data = scope_ids.get();
616+
std::vector<idx_t> new_ids;
617+
if (shift_ids) {
618+
new_ids.resize(n_entry);
619+
for (size_t k = 0; k < n_entry; k++) {
620+
new_ids[k] = scope_ids[k] + shift_id_offsets[i];
621+
}
622+
scope_ids_data = new_ids.data();
623+
}
608624
update_entries(
609625
j,
610626
l.size - n_entry,
611627
n_entry,
612-
ScopedIds(il, j).get(),
628+
scope_ids_data,
613629
ScopedCodes(il, j).get());
614630
}
615631
assert(l.size == l.capacity);
@@ -638,7 +654,7 @@ size_t OnDiskInvertedLists::merge_from(
638654
size_t OnDiskInvertedLists::merge_from_1(
639655
const InvertedLists* ils,
640656
bool verbose) {
641-
return merge_from(&ils, 1, verbose);
657+
return merge_from_multiple(&ils, 1, verbose);
642658
}
643659

644660
void OnDiskInvertedLists::crop_invlists(size_t l0, size_t l1) {

faiss/invlists/OnDiskInvertedLists.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,10 @@ struct OnDiskInvertedLists : InvertedLists {
101101

102102
// copy all inverted lists into *this, in compact form (without
103103
// allocating slots)
104-
size_t merge_from(
104+
size_t merge_from_multiple(
105105
const InvertedLists** ils,
106106
int n_il,
107+
bool shift_ids = false,
107108
bool verbose = false);
108109

109110
/// same as merge_from_multiple for a single invlist

tests/test_merge.cpp

+26-2
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ size_t nq = 100;
3232
int nindex = 4;
3333
int k = 10;
3434
int nlist = 40;
35+
int shard_size = nb / nindex;
3536

3637
struct CommonData {
3738
std::vector<float> database;
@@ -100,7 +101,7 @@ int compare_merged(
100101
auto il = new faiss::OnDiskInvertedLists(
101102
index0->nlist, index0->code_size, filename.c_str());
102103

103-
il->merge_from(lists.data(), lists.size());
104+
il->merge_from_multiple(lists.data(), lists.size(), shift_ids);
104105

105106
index0->replace_invlists(il, true);
106107
index0->ntotal = ntotal;
@@ -110,11 +111,14 @@ int compare_merged(
110111
nq, cd.queries.data(), k, newD.data(), newI.data());
111112

112113
size_t ndiff = 0;
114+
bool adjust_ids = shift_ids && !standard_merge;
113115
for (size_t i = 0; i < k * nq; i++) {
114-
if (refI[i] != newI[i]) {
116+
idx_t new_id = adjust_ids ? refI[i] % shard_size : refI[i];
117+
if (refI[i] != new_id) {
115118
ndiff++;
116119
}
117120
}
121+
118122
return ndiff;
119123
}
120124

@@ -220,3 +224,23 @@ TEST(MERGE, merge_flat_ondisk_2) {
220224
int ndiff = compare_merged(&index_shards, false, false);
221225
EXPECT_GE(0, ndiff);
222226
}
227+
228+
// now use ondisk specific merge and use shift ids
229+
TEST(MERGE, merge_flat_ondisk_3) {
230+
faiss::IndexShards index_shards(d, false, false);
231+
index_shards.own_indices = true;
232+
233+
std::vector<idx_t> ids;
234+
for (int i = 0; i < nb; ++i) {
235+
int id = i % shard_size;
236+
ids.push_back(id);
237+
}
238+
for (int i = 0; i < nindex; i++) {
239+
index_shards.add_shard(
240+
new faiss::IndexIVFFlat(&cd.quantizer, d, nlist));
241+
}
242+
EXPECT_TRUE(index_shards.is_trained);
243+
index_shards.add_with_ids(nb, cd.database.data(), ids.data());
244+
int ndiff = compare_merged(&index_shards, true, false);
245+
EXPECT_GE(0, ndiff);
246+
}

0 commit comments

Comments
 (0)