@@ -227,64 +227,6 @@ def _iterate_transformed(self, ds, start, batch_size, dt):
227
227
for buffer in ds .iterate (start , batch_size , dt ):
228
228
yield buffer
229
229
230
def index_shard_and_quantize(self):
    """Build each missing index shard and save its coarse quantization.

    The template index is read from ``self.index_template_file``. For every
    shard number in ``range(self.nshards)`` whose shard file and
    coarse-quantization file are both absent, the method:

    1. exclusively creates the coarse-quantization file (mode ``"xb"``), so
       an already existing file means the shard is skipped;
    2. searches each batch of vectors against a GPU copy of the IVF coarse
       quantizer and appends the ``self.nprobe`` returned list ids per
       vector to that file;
    3. adds the batch to the index through ``self._index_add_core_wrapper``
       using the first returned list id of each vector;
    4. writes the populated index to the shard file.

    If processing a shard fails after its coarse-quantization file was
    created, the partial outputs of that shard are deleted before the error
    propagates. Otherwise the leftover files would make every later run
    skip the shard.

    Raises:
        AssertionError: if the template file is missing, the quantizer holds
            fewer than ``self.nprobe`` vectors, a batch has an unexpected
            dimension, a returned list id is out of range, or more than
            ``self.shard_size`` vectors were consumed for one shard.
    """
    assert os.path.exists(self.index_template_file)
    index = faiss.read_index(self.index_template_file)
    index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
    assert self.nprobe <= index_ivf.quantizer.ntotal, (
        f"the number of vectors {index_ivf.quantizer.ntotal} is not enough"
        f" to retrieve {self.nprobe} neighbours, check."
    )

    # Expected width of the batches yielded by _iterate_transformed.
    if is_pretransform_index(index):
        d = index.chain.at(0).d_out
    else:
        d = self.input_d

    for i in range(0, self.nshards):
        sfn = f"{self.index_shard_prefix}{i}"
        cqfn = f"{self.coarse_quantization_prefix}{i}"  # fixme
        if os.path.exists(sfn) or os.path.exists(cqfn):
            logging.info(f"skipping shard: {i}")
            continue

        # Exclusive creation: if the file appeared since the check above,
        # open() raises FileExistsError and the shard is skipped.
        try:
            cqf = open(cqfn, "xb")
        except FileExistsError:
            logging.info(f"skipping shard: {i}")
            continue

        completed = False
        try:
            with cqf:
                index.reset()
                start = i * self.shard_size
                j = 0  # vectors consumed so far for this shard
                quantizer = faiss.index_cpu_to_all_gpus(
                    index_ivf.quantizer
                )
                for xb_j in tqdm(
                    self._iterate_transformed(
                        self.xb_ds,
                        start,
                        EMBEDDINGS_BATCH_SIZE,
                        np.float32,
                    ),
                    file=sys.stdout,
                ):
                    assert xb_j.shape[1] == d
                    _, I = quantizer.search(xb_j, self.nprobe)
                    assert np.amin(I) >= 0, f"{I}"
                    assert np.amax(I) < index_ivf.nlist
                    cqf.write(I)
                    self._index_add_core_wrapper(  # fixme
                        index_ivf,
                        xb_j,
                        np.arange(start + j, start + j + xb_j.shape[0]),
                        I[:, 0],
                    )
                    j += xb_j.shape[0]
                    assert j <= self.shard_size
                    if j == self.shard_size:
                        break
                logging.info(f"writing {sfn}...")
                faiss.write_index(index, sfn)
            completed = True
        finally:
            if not completed:
                # Both files were absent before this call created them, so
                # whatever exists now is an incomplete artifact of the failed
                # attempt. Remove it so the shard is retried on the next run.
                for partial in (cqfn, sfn):
                    try:
                        os.remove(partial)
                    except FileNotFoundError:
                        pass
                    except OSError as cleanup_error:
                        # Best effort: never mask the original failure.
                        logging.warning(
                            f"could not remove {partial}: {cleanup_error}"
                        )
    logging.info("done")
287
-
288
230
def index_shard (self ):
289
231
assert os .path .exists (self .index_template_file )
290
232
index = faiss .read_index (self .index_template_file )
0 commit comments