diff --git a/benchs/bench_fw/benchmark.py b/benchs/bench_fw/benchmark.py index 835c73bacc..42c6b6ffa6 100644 --- a/benchs/bench_fw/benchmark.py +++ b/benchs/bench_fw/benchmark.py @@ -9,7 +9,7 @@ from statistics import mean, median from typing import Any, Dict, List, Optional -import faiss # @manual=//faiss/python:pyfaiss_gpu +import faiss # @manual=//faiss/python:pyfaiss import numpy as np @@ -214,6 +214,7 @@ def set_io(self, benchmark_io: BenchmarkIO): @dataclass class TrainOperator(IndexOperator): codec_descs: List[CodecDescriptor] = field(default_factory=lambda: []) + assemble_opaque: bool = True def get_desc(self, name: str) -> Optional[CodecDescriptor]: for desc in self.codec_descs: @@ -248,6 +249,7 @@ def build_index_wrapper(self, codec_desc: CodecDescriptor): factory=codec_desc.factory, training_vectors=codec_desc.training_vectors, codec_name=codec_desc.get_name(), + assemble_opaque=self.assemble_opaque, ) index.set_io(self.io) codec_desc.index = index diff --git a/benchs/bench_fw/benchmark_io.py b/benchs/bench_fw/benchmark_io.py index 79b0fd09c4..a67b09cb34 100644 --- a/benchs/bench_fw/benchmark_io.py +++ b/benchs/bench_fw/benchmark_io.py @@ -13,11 +13,11 @@ from typing import Any, Dict, List, Optional from zipfile import ZipFile -import faiss # @manual=//faiss/python:pyfaiss_gpu +import faiss # @manual=//faiss/python:pyfaiss import numpy as np import submitit -from faiss.contrib.datasets import ( # @manual=//faiss/contrib:faiss_contrib_gpu +from faiss.contrib.datasets import ( # @manual=//faiss/contrib:faiss_contrib dataset_from_name, ) diff --git a/benchs/bench_fw/descriptors.py b/benchs/bench_fw/descriptors.py index b200b1be12..747612c09e 100644 --- a/benchs/bench_fw/descriptors.py +++ b/benchs/bench_fw/descriptors.py @@ -8,7 +8,7 @@ from dataclasses import dataclass from typing import Any, Dict, List, Optional -import faiss # @manual=//faiss/python:pyfaiss_gpu +import faiss # @manual=//faiss/python:pyfaiss from .benchmark_io import BenchmarkIO from 
.utils import timer @@ -16,6 +16,9 @@ logger = logging.getLogger(__name__) +# Important: filenames end with "." and carry no extension (npy, codec, index); +# when writing files, you must append the extension yourself, e.g. filename + "npy". + @dataclass class IndexDescriptorClassic: bucket: Optional[str] = None @@ -110,21 +113,25 @@ def get_filename( filename += "." return filename + def get_kmeans_filename(self, k): + return f"{self.get_filename()}kmeans_{k}." + def k_means(self, io, k, dry_run): logger.info(f"k_means {k} {self}") kmeans_vectors = DatasetDescriptor( - tablename=f"{self.get_filename()}kmeans_{k}.npy" + tablename=f"{self.get_filename()}kmeans_{k}" ) - meta_filename = kmeans_vectors.tablename + ".json" - if not io.file_exist(kmeans_vectors.tablename) or not io.file_exist( + kmeans_filename = kmeans_vectors.get_filename() + "npy" + meta_filename = kmeans_vectors.get_filename() + "json" + if not io.file_exist(kmeans_filename) or not io.file_exist( meta_filename ): if dry_run: - return None, None, kmeans_vectors.tablename + return None, None, kmeans_filename x = io.get_dataset(self) kmeans = faiss.Kmeans(d=x.shape[1], k=k, gpu=True) _, t, _ = timer("k_means", lambda: kmeans.train(x)) - io.write_nparray(kmeans.centroids, kmeans_vectors.tablename) + io.write_nparray(kmeans.centroids, kmeans_filename) io.write_json({"k_means_time": t}, meta_filename) else: t = io.read_json(meta_filename)["k_means_time"] diff --git a/benchs/bench_fw/index.py b/benchs/bench_fw/index.py index 8fa2c69b13..c3c17a91f5 100644 --- a/benchs/bench_fw/index.py +++ b/benchs/bench_fw/index.py @@ -11,18 +11,18 @@ from dataclasses import dataclass from typing import ClassVar, Dict, List, Optional -import faiss # @manual=//faiss/python:pyfaiss_gpu +import faiss # @manual=//faiss/python:pyfaiss import numpy as np from faiss.benchs.bench_fw.descriptors import IndexBaseDescriptor -from faiss.contrib.evaluation import ( # @manual=//faiss/contrib:faiss_contrib_gpu +from faiss.contrib.evaluation import ( # 
@manual=//faiss/contrib:faiss_contrib knn_intersection_measure, OperatingPointsWithRanges, ) -from faiss.contrib.factory_tools import ( # @manual=//faiss/contrib:faiss_contrib_gpu +from faiss.contrib.factory_tools import ( # @manual=//faiss/contrib:faiss_contrib reverse_index_factory, ) -from faiss.contrib.ivf_tools import ( # @manual=//faiss/contrib:faiss_contrib_gpu +from faiss.contrib.ivf_tools import ( # @manual=//faiss/contrib:faiss_contrib add_preassigned, replace_ivf_quantizer, ) @@ -635,11 +635,12 @@ def get_index_name(self) -> Optional[str]: def fetch_index(self): # read index from file if it is already available + index_filename = None if self.index_path: index_filename = os.path.basename(self.index_path) - else: + elif self.index_name: index_filename = self.index_name + "index" - if self.io.file_exist(index_filename): + if index_filename and self.io.file_exist(index_filename): if self.index_path: index = self.io.read_index( index_filename, @@ -681,7 +682,7 @@ def fetch_index(self): ) assert index.ntotal == xb.shape[0] or index_ivf.ntotal == xb.shape[0] logger.info("Added vectors to index") - if self.serialize_full_index: + if self.serialize_full_index and index_filename: codec_size = self.io.write_index(index, index_filename) assert codec_size is not None @@ -908,6 +909,7 @@ def get_codec(self): class IndexFromFactory(Index): factory: Optional[str] = None training_vectors: Optional[DatasetDescriptor] = None + assemble_opaque: bool = True def __post_init__(self): super().__post_init__() @@ -916,6 +918,19 @@ def __post_init__(self): if self.factory != "Flat" and self.training_vectors is None: raise ValueError(f"training_vectors is not set for {self.factory}") + def get_codec_name(self): + codec_name = super().get_codec_name() + if codec_name is None: + codec_name = f"{self.factory.replace(',', '_')}." + codec_name += f"d_{self.d}.{self.metric.upper()}." 
+ if self.factory != "Flat": + assert self.training_vectors is not None + codec_name += self.training_vectors.get_filename("xt") + if self.construction_params is not None: + codec_name += IndexBaseDescriptor.param_dict_list_to_name(self.construction_params) + self.codec_name = codec_name + return self.codec_name + def fetch_meta(self, dry_run=False): meta_filename = self.get_codec_name() + "json" if self.io.file_exist(meta_filename): @@ -1021,14 +1036,13 @@ def get_quantizer(self, dry_run, pretransform=None): def assemble(self, dry_run): logger.info(f"assemble {self.factory}") model = self.get_model() - opaque = True t_aggregate = 0 # try: # reverse_index_factory(model) # opaque = False # except NotImplementedError: # opaque = True - if opaque: + if self.assemble_opaque: codec = model else: if isinstance(model, faiss.IndexPreTransform): diff --git a/benchs/bench_fw/optimize.py b/benchs/bench_fw/optimize.py index ac6c45ab0c..1357d556c9 100644 --- a/benchs/bench_fw/optimize.py +++ b/benchs/bench_fw/optimize.py @@ -7,9 +7,9 @@ from dataclasses import dataclass from typing import Dict, List, Tuple -import faiss # @manual=//faiss/python:pyfaiss_gpu +import faiss # @manual=//faiss/python:pyfaiss -# from faiss.contrib.evaluation import ( # @manual=//faiss/contrib:faiss_contrib_gpu +# from faiss.contrib.evaluation import ( # @manual=//faiss/contrib:faiss_contrib # OperatingPoints, # ) diff --git a/benchs/bench_fw/utils.py b/benchs/bench_fw/utils.py index 3151c0c2da..b21e8bbd7c 100644 --- a/benchs/bench_fw/utils.py +++ b/benchs/bench_fw/utils.py @@ -9,10 +9,10 @@ from multiprocessing.pool import ThreadPool from time import perf_counter -import faiss # @manual=//faiss/python:pyfaiss_gpu +import faiss # @manual=//faiss/python:pyfaiss import numpy as np -from faiss.contrib.evaluation import ( # @manual=//faiss/contrib:faiss_contrib_gpu +from faiss.contrib.evaluation import ( # @manual=//faiss/contrib:faiss_contrib OperatingPoints, )