-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
first commit, initializing the repository with format
- Loading branch information
cole_foster@brown.edu
committed
Aug 5, 2024
0 parents
commit 80cee5a
Showing
26 changed files
with
7,309 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
# Continuous-integration workflow: installs the Python dependencies and the
# bundled hnswlib bindings, runs the search/eval/plot scripts and uploads the
# resulting CSV and plot as a build artifact.
name: CI

on:
  push:
    # Sequence of patterns matched against refs/heads
    branches:
      # Push events on the main or master branch
      - main
      - master
    # Sequence of patterns matched against refs/tags
    tags: '*'

jobs:
  test:
    name: ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        # `version` and `arch` are only referenced by the job name above.
        version:
          - '1.8'
        os:
          - ubuntu-latest
        arch:
          - x64
        # NOTE(review): this exclusion matches no combination of the matrix
        # above (only ubuntu-latest / x64 is listed) - confirm it is still needed.
        exclude:
          - os: macOS-latest
            arch: x86
        python-version: ["3.8"]
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: 'true'
      - name: Set up Python 3.8
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - uses: conda-incubator/setup-miniconda@v2
        with:
          auto-update-conda: true
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        # Login shell (-l) so that `conda activate` is available in the script.
        shell: bash -el {0}
        run: |
          conda create -n hsp python=3.8
          conda activate hsp
          conda install matplotlib
          pip install h5py
          cd hnswlib/
          pip install .
      - name: Run benchmark
        shell: bash -el {0}
        run: |
          conda activate hsp
          python3 search/search.py
          python3 eval/eval.py
          python3 eval/plot.py res.csv --size 300K
      # upload-artifact v3 has been retired by GitHub and fails on hosted
      # runners; v4 accepts the same `name` / `path` inputs used here.
      - uses: actions/upload-artifact@v4
        with:
          name: Results on 300k
          path: |
            res.csv
            result_300K.png
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
data/ | ||
result/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# Team HSP Submission to SISAP 2024 Indexing Challenge |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
''' | ||
Cole Foster | ||
July 11th, 2023 | ||
SISAP Indexing Challenge | ||
''' | ||
import argparse | ||
import hnswlib | ||
import h5py | ||
import numpy as np | ||
import os | ||
from pathlib import Path | ||
from urllib.request import urlretrieve | ||
import time | ||
|
||
# Root directory for downloaded datasets and queries.  The default is the
# original cluster path; set the SISAP_DATA_DIR environment variable to use a
# different location without editing this file.
data_directory = os.environ.get("SISAP_DATA_DIR", "/users/cfoste18/scratch/datasets/LAION")
|
||
def download(src, dst):
    """Fetch ``src`` into ``dst`` unless ``dst`` already exists.

    Parent directories of ``dst`` are created as needed.  The payload is
    written to ``dst + ".part"`` first and renamed into place only after the
    transfer has finished, so an interrupted download is never mistaken for a
    complete file by the existence check on a later run.

    Args:
        src: URL to retrieve.
        dst: Destination file path.
    """
    if os.path.exists(dst):
        return

    os.makedirs(Path(dst).parent, exist_ok=True)
    print('downloading %s -> %s...' % (src, dst))

    partial = f"{dst}.part"
    try:
        urlretrieve(src, partial)
    except BaseException:
        # Drop the incomplete file (also on Ctrl-C) so a retry starts clean.
        if os.path.exists(partial):
            os.remove(partial)
        raise
    os.replace(partial, dst)
|
||
def prepare(kind, size):
    """Ensure the query and dataset files for ``kind``/``size`` are on disk.

    Each file is handed to ``download`` with the destination
    ``<data_directory>/<kind>/<size>/<name>.h5`` where ``<name>`` is
    ``query`` or ``dataset``.

    Args:
        kind: Embedding variant, used in the dataset URL and the local path.
        size: Dataset size label, used in the dataset URL and the local path.
    """
    base = "https://sisap-23-challenge.s3.amazonaws.com/SISAP23-Challenge"
    target_dir = os.path.join(data_directory, kind, size)

    # (local file stem, remote URL) pairs, fetched in this order.
    sources = (
        ("query", "http://ingeotec.mx/~sadit/sisap2024-data/public-queries-2024-laion2B-en-clip768v2-n=10k.h5"),
        ("dataset", f"{base}/laion2B-en-{kind}-n={size}.h5"),
    )
    for stem, url in sources:
        download(url, os.path.join(target_dir, f"{stem}.h5"))
|
||
def store_results(dst, algo, kind, D, I, buildtime, querytime, params, size):
    """Write the outcome of one search run to an HDF5 file at ``dst``.

    The parent directory of ``dst`` is created if missing and an existing
    file is overwritten (mode ``'w'``).

    Args:
        dst: Output file path.
        algo: Stored as the ``algo`` attribute.
        kind: Stored as the ``data`` attribute.
        D: Array of distances, stored as the ``dists`` dataset.
        I: Array of neighbour ids, stored as the ``knns`` dataset.
        buildtime: Stored as the ``buildtime`` attribute.
        querytime: Stored as the ``querytime`` attribute.
        params: Stored as the ``params`` attribute.
        size: Stored as the ``size`` attribute.
    """
    os.makedirs(Path(dst).parent, exist_ok=True)
    # The context manager closes the file even when one of the writes raises,
    # so a failed run does not leave an open handle behind.
    with h5py.File(dst, 'w') as f:
        f.attrs['algo'] = algo
        f.attrs['data'] = kind
        f.attrs['buildtime'] = buildtime
        f.attrs['querytime'] = querytime
        f.attrs['size'] = size
        f.attrs['params'] = params
        f.create_dataset('knns', I.shape, dtype=I.dtype)[:] = I
        f.create_dataset('dists', D.shape, dtype=D.dtype)[:] = D
|
||
|
||
def run(size, M, ef_construction):
    """Build an hnswlib index over the clip768v2 data and sweep the search ``ef``.

    Steps: make sure the dataset and query files exist (``prepare``), build
    the index by feeding the dataset in chunks, then run a 30-nearest-neighbour
    query for every ``ef`` value in the sweep and store each result with
    ``store_results`` under ``result/<kind>/<size>/``.

    Args:
        size: Dataset size label (e.g. ``"300K"``); selects the data files.
        M: hnswlib ``M`` parameter passed to ``init_index``.
        ef_construction: hnswlib ``ef_construction`` parameter passed to
            ``init_index``.
    """
    kind = "clip768v2"
    key = "emb"
    print(f"Running HNSW on {kind}-{size}")
    index_identifier = f"HNSW-M-{M}-EFC-{ef_construction}"

    #> Download dataset if necessary
    prepare(kind, size)
    D = 768

    #> Initialize the HNSW index
    index = hnswlib.Index(space='ip', dim=D)  # possible options are l2, cosine or ip

    #> Load the dataset:
    start_time = time.time()
    with h5py.File(os.path.join(data_directory, kind, size, "dataset.h5"), 'r') as f:
        dataset = f[key]
        N, DD = dataset.shape
        print(f'Datset has N={N} rows and D={DD} columns')
        index.init_index(max_elements=N, ef_construction=ef_construction, M=M, random_seed=10)

        # Feed the index chunk by chunk so only one chunk of rows is loaded
        # into memory at a time.
        chunk_size = 100000
        for start_index in range(0, N, chunk_size):
            end_index = min(start_index + chunk_size, N)

            # load this chunk into memory
            data_chunk = dataset[start_index:end_index]

            # add it to hnsw index
            index.add_items(data_chunk)
    build_time = time.time() - start_time
    print(f"Done constructing index in {build_time:.4} (s)")

    # get the queries; the file is closed as soon as they are copied into memory
    with h5py.File(os.path.join(data_directory, kind, size, "query.h5"), "r") as query_file:
        queries = np.array(query_file[key], dtype=np.float32)

    #> Searching on the index
    ef_vec = [30, 50, 70, 100, 140, 190, 250, 320, 400, 500, 650, 800, 1000, 1200, 1500, 1800, 2100, 2500, 3000]
    for ef in ef_vec:
        print(f"Searching with ef={ef}")
        start = time.time()
        index.set_ef(ef)  # the smallest ef in the sweep equals k (30)
        labels, distances = index.knn_query(queries, k=30)
        search_time = time.time() - start
        print(f"Done searching in {search_time:.4}s.")

        # save the results
        labels = labels + 1  # shift the returned labels by one: groundtruth is 1-indexed
        identifier = f"index=({index_identifier}),query=(ef={ef})"
        store_results(
            os.path.join("result/", kind, size, f"{identifier}.h5"),
            index_identifier,
            kind,
            distances,
            labels,
            build_time,
            search_time,
            identifier,
            size,
        )
|
||
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: parse the dataset size and the two HNSW build
    # parameters, echo them, then run the benchmark.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--size",
        type=str,
        default="300K",
        # Validated by argparse instead of an `assert`, which would be
        # skipped when Python runs with -O.
        choices=["300K", "10M", "100M"],
    )
    parser.add_argument(
        "-M",
        type=int,
        default=20,
    )
    parser.add_argument(
        "-E",
        type=int,
        default=100,
    )
    args = parser.parse_args()

    print("Running Script With:")
    print(f" * N={args.size}")
    print(f" * M={args.M} | HNSW Parameter M")
    print(f" * EFC={args.E} | HNSW Parameter ef_construction")
    run(args.size, args.M, args.E)
    print("Done! Have a good day!")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
''' | ||
Cole Foster | ||
August 4th, 2024 | ||
SISAP 2024 Indexing Challenge | ||
''' | ||
import argparse | ||
import GraphHierarchy | ||
import h5py | ||
import numpy as np | ||
import os | ||
from pathlib import Path | ||
from urllib.request import urlretrieve | ||
import time | ||
|
||
|
||
def run(size, M, ef_construction):
    """Create a ``GraphHierarchy.Index`` with ``space='ip'`` and ``dim=768``.

    Only the index object is constructed here; ``size``, ``M`` and
    ``ef_construction`` are accepted but not used by this function yet, and
    nothing is returned.
    """
    embedding_dim = 768

    #> Initialize the index
    # NOTE(review): the original comment listed l2, cosine or ip as the
    # possible spaces - confirm against the GraphHierarchy bindings.
    graph_index = GraphHierarchy.Index(space='ip', dim=embedding_dim)
|
||
|
||
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: parse the dataset size and the two build
    # parameters, echo them, then call run().
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--size",
        type=str,
        default="300K",
        # Validated by argparse instead of an `assert`, which would be
        # skipped when Python runs with -O.
        choices=["300K", "10M", "100M"],
    )
    parser.add_argument(
        "-M",
        type=int,
        default=20,
    )
    parser.add_argument(
        "-E",
        type=int,
        default=100,
    )
    args = parser.parse_args()

    print("Running Script With:")
    print(f" * N={args.size}")
    print(f" * M={args.M} | HNSW Parameter M")
    print(f" * EFC={args.E} | HNSW Parameter ef_construction")
    run(args.size, args.M, args.E)
    print("Done! Have a good day!")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
hnswlib.egg-info/ | ||
build/ | ||
dist/ | ||
tmp/ | ||
python_bindings/tests/__pycache__/ | ||
*.pyd | ||
hnswlib.cpython*.so | ||
var/ | ||
.idea/ | ||
.vscode/ | ||
.vs/ | ||
**.DS_Store | ||
*.pyc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
# Defines the `hnswlib` INTERFACE target and its install/export rules.
cmake_minimum_required(VERSION 3.0...3.26)

project(hnswlib
    LANGUAGES CXX)

# GNUInstallDirs provides the CMAKE_INSTALL_* directory variables used below;
# CheckCXXCompilerFlag provides check_cxx_compiler_flag().
include(GNUInstallDirs)
include(CheckCXXCompilerFlag)

# INTERFACE library: the target compiles nothing itself and only carries
# usage requirements (the include directories below) to whatever links it.
add_library(hnswlib INTERFACE)
# Namespaced alias matching the NAMESPACE used by the installed export.
add_library(hnswlib::hnswlib ALIAS hnswlib)

# Include path: this source directory when building in-tree, the install
# include directory once installed.
target_include_directories(hnswlib INTERFACE
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)

# Install
# Copy the hnswlib/ directory into the install include directory.
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/hnswlib
    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

# Register the target in the hnswlibTargets export set.
install(TARGETS hnswlib
    EXPORT hnswlibTargets)

# Write the export set as hnswlibConfig.cmake under <libdir>/cmake/hnswlib,
# with targets prefixed by hnswlib::.
install(EXPORT hnswlibTargets
    FILE hnswlibConfig.cmake
    NAMESPACE hnswlib::
    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hnswlib)
|
||
# Examples and tests
# HNSWLIB_EXAMPLES defaults to ON when hnswlib is the top-level project and
# to OFF when it is pulled into another project.
if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
    option(HNSWLIB_EXAMPLES "Build examples and tests." ON)
else()
    option(HNSWLIB_EXAMPLES "Build examples and tests." OFF)
endif()
if(HNSWLIB_EXAMPLES)
    set(CMAKE_CXX_STANDARD 11)

    # Per-compiler flags.  Each branch assigns CMAKE_CXX_FLAGS outright, so
    # any value supplied by the user or toolchain is replaced, not extended.
    if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
        # NOTE(review): this branch passes `-openmp` while the GNU branch
        # passes `-fopenmp` - confirm the spelling is intended.
        SET( CMAKE_CXX_FLAGS "-Ofast -std=c++11 -DHAVE_CXX0X -openmp -fpic -ftree-vectorize" )
        # Prefer -march=native; if the compiler rejects it, try the Apple M1
        # CPU flag instead.
        check_cxx_compiler_flag("-march=native" COMPILER_SUPPORT_NATIVE_FLAG)
        if(COMPILER_SUPPORT_NATIVE_FLAG)
            SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native" )
            message("set -march=native flag")
        else()
            check_cxx_compiler_flag("-mcpu=apple-m1" COMPILER_SUPPORT_M1_FLAG)
            if(COMPILER_SUPPORT_M1_FLAG)
                SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=apple-m1" )
                message("set -mcpu=apple-m1 flag")
            endif()
        endif()
    elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
        # -w suppresses all warnings for the example/test builds.
        SET( CMAKE_CXX_FLAGS "-Ofast -lrt -std=c++11 -DHAVE_CXX0X -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" )
    elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
        SET( CMAKE_CXX_FLAGS "/O2 -DHAVE_CXX0X /W1 /openmp /EHsc" )
    endif()

    # examples
    # One executable per source file under examples/cpp, each linked against
    # the hnswlib interface target to pick up its include path.
    add_executable(example_search examples/cpp/example_search.cpp)
    target_link_libraries(example_search hnswlib)

    add_executable(example_epsilon_search examples/cpp/example_epsilon_search.cpp)
    target_link_libraries(example_epsilon_search hnswlib)

    add_executable(example_multivector_search examples/cpp/example_multivector_search.cpp)
    target_link_libraries(example_multivector_search hnswlib)

    add_executable(example_filter examples/cpp/example_filter.cpp)
    target_link_libraries(example_filter hnswlib)

    add_executable(example_replace_deleted examples/cpp/example_replace_deleted.cpp)
    target_link_libraries(example_replace_deleted hnswlib)

    add_executable(example_mt_search examples/cpp/example_mt_search.cpp)
    target_link_libraries(example_mt_search hnswlib)

    add_executable(example_mt_filter examples/cpp/example_mt_filter.cpp)
    target_link_libraries(example_mt_filter hnswlib)

    add_executable(example_mt_replace_deleted examples/cpp/example_mt_replace_deleted.cpp)
    target_link_libraries(example_mt_replace_deleted hnswlib)

    # tests
    # Test executables built from tests/cpp; they are plain executables and
    # are not registered with add_test() in this file.
    add_executable(multivector_search_test tests/cpp/multivector_search_test.cpp)
    target_link_libraries(multivector_search_test hnswlib)

    add_executable(epsilon_search_test tests/cpp/epsilon_search_test.cpp)
    target_link_libraries(epsilon_search_test hnswlib)

    add_executable(test_updates tests/cpp/updates_test.cpp)
    target_link_libraries(test_updates hnswlib)

    add_executable(searchKnnCloserFirst_test tests/cpp/searchKnnCloserFirst_test.cpp)
    target_link_libraries(searchKnnCloserFirst_test hnswlib)

    add_executable(searchKnnWithFilter_test tests/cpp/searchKnnWithFilter_test.cpp)
    target_link_libraries(searchKnnWithFilter_test hnswlib)

    add_executable(multiThreadLoad_test tests/cpp/multiThreadLoad_test.cpp)
    target_link_libraries(multiThreadLoad_test hnswlib)

    add_executable(multiThread_replace_test tests/cpp/multiThread_replace_test.cpp)
    target_link_libraries(multiThread_replace_test hnswlib)

    # `main` is the only target here built from two source files.
    add_executable(main tests/cpp/main.cpp tests/cpp/sift_1b.cpp)
    target_link_libraries(main hnswlib)
endif()
Oops, something went wrong.