Skip to content

Commit

Permalink
index construction successful
Browse files Browse the repository at this point in the history
  • Loading branch information
cole_foster@brown.edu committed Aug 5, 2024
1 parent f571051 commit eef28dc
Show file tree
Hide file tree
Showing 8 changed files with 1,193 additions and 2,044 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
data/
result/
dev/
slurm*
run.sh
90 changes: 82 additions & 8 deletions search/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,83 @@
from urllib.request import urlretrieve
import time

# Root directory for the downloaded data. Files are stored and read as
# <data_directory>/<kind>/<size>/<version>.h5 (e.g. ".../dataset.h5").
# NOTE(review): hard-coded absolute path, presumably specific to the
# author's cluster account -- confirm/adjust before running elsewhere.
data_directory = "/users/cfoste18/scratch/datasets/LAION"

def run(size, M, ef_construction):
def download(src, dst):
    """Download ``src`` to ``dst`` unless ``dst`` already exists.

    The payload is first written to ``dst + ".part"`` and only moved to
    ``dst`` once the transfer has finished. An interrupted download therefore
    never leaves a truncated ``dst`` behind that the existence check would
    treat as a complete file on the next run.

    Args:
        src: URL to fetch.
        dst: Destination file path; missing parent directories are created.
    """
    if os.path.exists(dst):
        return

    os.makedirs(Path(dst).parent, exist_ok=True)
    print('downloading %s -> %s...' % (src, dst))

    # A stale ".part" file from an earlier failed attempt is simply overwritten.
    partial = f"{dst}.part"
    urlretrieve(src, partial)
    os.replace(partial, dst)

def prepare(kind, size):
    """Fetch the query file and the dataset file for ``kind`` / ``size``.

    Both files are handed to ``download`` and end up in
    ``<data_directory>/<kind>/<size>/`` as ``query.h5`` and ``dataset.h5``.

    Args:
        kind: Embedding kind, interpolated into the dataset URL and the path.
        size: Dataset size label, interpolated into the dataset URL and the path.
    """
    dataset_base_url = "https://sisap-23-challenge.s3.amazonaws.com/SISAP23-Challenge"
    target_dir = os.path.join(data_directory, kind, size)

    # (file stem, source URL) pairs, fetched in this order.
    sources = (
        ("query", "http://ingeotec.mx/~sadit/sisap2024-data/public-queries-2024-laion2B-en-clip768v2-n=10k.h5"),
        ("dataset", f"{dataset_base_url}/laion2B-en-{kind}-n={size}.h5"),
    )
    for stem, url in sources:
        download(url, os.path.join(target_dir, f"{stem}.h5"))

def store_results(dst, algo, kind, D, I, buildtime, querytime, params, size):
    """Write search results and run metadata to an HDF5 file at ``dst``.

    The file is opened in ``'w'`` mode inside a ``with`` block so the handle
    is closed even if one of the writes raises.

    Args:
        dst: Output file path; missing parent directories are created.
        algo: Stored as file attribute ``'algo'``.
        kind: Stored as file attribute ``'data'``.
        D: Array written to the ``'dists'`` dataset.
        I: Array written to the ``'knns'`` dataset.
        buildtime: Stored as file attribute ``'buildtime'``.
        querytime: Stored as file attribute ``'querytime'``.
        params: Stored as file attribute ``'params'``.
        size: Stored as file attribute ``'size'``.
    """
    os.makedirs(Path(dst).parent, exist_ok=True)
    with h5py.File(dst, 'w') as f:
        f.attrs['algo'] = algo
        f.attrs['data'] = kind
        f.attrs['buildtime'] = buildtime
        f.attrs['querytime'] = querytime
        f.attrs['size'] = size
        f.attrs['params'] = params
        # `data=` creates each dataset with the array's own shape and dtype.
        f.create_dataset('knns', data=I)
        f.create_dataset('dists', data=D)


def run(size, scaling, partitions):
    """Download the data if needed and build a GraphHierarchy index over it.

    The dataset is streamed from the HDF5 file in fixed-size chunks so the
    whole matrix is never read into memory in one slice.

    Args:
        size: Dataset size label; used in the download URL and on-disk path.
        scaling: Forwarded to ``index.init_index(scaling=...)``.
        partitions: Forwarded to ``index.build(...)``.
    """
    kind = "clip768v2"
    key = "emb"
    print(f"Running HNSW on {kind}-{size}")
    # Not consumed anywhere in this function yet.
    index_identifier = f"HNSW-s-{scaling}-p-{partitions}"

    #> Download dataset if necessary
    prepare(kind, size)
    D = 768

    #> Index parameters. `scaling` is taken from the caller (it used to be
    #> overwritten with a hard-coded 10, which ignored the argument and could
    #> disagree with index_identifier).
    max_neighbors = 32

    #> Initialize the HNSW index
    index = GraphHierarchy.Index(space='ip', dim=D)  # possible options are l2, cosine or ip

    #> Load the dataset:
    start_time = time.time()
    with h5py.File(os.path.join(data_directory, kind, size, "dataset.h5"), 'r') as f:
        dataset = f[key]
        N, DD = dataset.shape
        print(f'Datset has N={N} rows and D={DD} columns')
        index.init_index(max_elements=N, scaling=scaling, max_neighbors=max_neighbors, random_seed=10)
        print(" * it init!")

        # iterate over the dataset, adding one chunk of rows at a time
        chunk_size = 100000
        for start_index in range(0, N, chunk_size):
            end_index = min(start_index + chunk_size, N)

            # load this chunk into memory
            data_chunk = dataset[start_index:end_index]

            # add it to hnsw index
            index.add_items(data_chunk)
    print(f" * done adding items {time.time() - start_time:.4} (s)")

    # construct
    index.build(partitions)
    build_time = time.time() - start_time
    print(f"Done Constructing Index in {build_time:.4f} (s)")



Expand All @@ -30,12 +102,14 @@ def run(size, M, ef_construction):
default="300K"
)
parser.add_argument(
"-M",
"-s",
"--scaling",
type=int,
default=20,
default=10,
)
parser.add_argument(
"-E",
"-p",
"--partitions",
type=int,
default=100,
)
Expand All @@ -44,7 +118,7 @@ def run(size, M, ef_construction):

print("Running Script With:")
print(f" * N={args.size}")
print(f" * M={args.M} | HNSW Parameter M")
print(f" * EFC={args.E} | HNSW Parameter ef_construction")
run(args.size, args.M, args.E)
print(f" * s={args.scaling}")
print(f" * p={args.partitions} ")
run(args.size, args.scaling, args.partitions)
print(f"Done! Have a good day!")
1 change: 1 addition & 0 deletions src/.gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
hnswlib.egg-info/
GraphHierarchy.egg-info/
build/
dist/
tmp/
Expand Down
5 changes: 5 additions & 0 deletions src/GraphHierarchy/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
This work is an adaptation of [hnswlib](https://github.com/nmslib/hnswlib.git) by Yury Malkov.

The library is extremely well optimized, and it would be foolish not to leverage this. This
submission takes the skeleton of hnswlib and adapts it to our needs.

Loading

0 comments on commit eef28dc

Please sign in to comment.