Skip to content

Commit

Permalink
first commit, initializing the repository with format
Browse files Browse the repository at this point in the history
  • Loading branch information
cole_foster@brown.edu committed Aug 5, 2024
0 parents commit 80cee5a
Show file tree
Hide file tree
Showing 26 changed files with 7,309 additions and 0 deletions.
63 changes: 63 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Continuous-integration workflow: builds the Python bindings inside a conda
# environment, runs the search/eval/plot benchmark scripts on the 300K subset,
# and uploads the resulting CSV and plot as a build artifact.
name: CI

on:
  push:
    # Sequence of patterns matched against refs/heads
    branches:
      # Push events on main branch
      - main
      - master
    # Sequence of patterns matched against refs/tags
    tags: '*'

jobs:
  test:
    name: ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        # NOTE(review): `version` and `arch` are only referenced by the job
        # name above; no step consumes them.
        version:
          - '1.8'
        os:
          - ubuntu-latest
        arch:
          - x64
        # NOTE(review): with the `os` list above this exclusion matches no
        # generated combination.
        exclude:
          - os: macOS-latest
            arch: x86
        python-version: ["3.8"]
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: 'true'
      - name: Set up Python 3.8
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - uses: conda-incubator/setup-miniconda@v2
        with:
          auto-update-conda: true
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        # Login shell (-l) so the conda initialisation is loaded and
        # `conda activate` works inside the script.
        shell: bash -el {0}
        run: |
          conda create -n hsp python=3.8
          conda activate hsp
          conda install matplotlib
          pip install h5py
          cd hnswlib/
          pip install .
      - name: Run benchmark
        shell: bash -el {0}
        run: |
          conda activate hsp
          python3 search/search.py
          python3 eval/eval.py
          python3 eval/plot.py res.csv --size 300K
      # v3 of the artifact actions is deprecated and rejected by GitHub-hosted
      # runners, so the upload step uses v4.
      - uses: actions/upload-artifact@v4
        with:
          name: Results on 300k
          path: |
            res.csv
            result_300K.png
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
data/
result/
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Team HSP Submission to SISAP 2024 Indexing Challenge
130 changes: 130 additions & 0 deletions search/search-hnsw.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
'''
Cole Foster
July 11th, 2023
SISAP Indexing Challenge
'''
import argparse
import hnswlib
import h5py
import numpy as np
import os
from pathlib import Path
from urllib.request import urlretrieve
import time

data_directory = "/users/cfoste18/scratch/datasets/LAION"

def download(src, dst):
    """Fetch ``src`` into the file ``dst`` unless ``dst`` already exists.

    The payload is first written to ``<dst>.part`` and only renamed to ``dst``
    once the transfer has finished. An interrupted download therefore never
    leaves a truncated file at ``dst`` that a later call would mistake for a
    complete one.

    Args:
        src: URL to retrieve.
        dst: Destination file path; missing parent directories are created.

    Raises:
        Whatever ``urlretrieve`` raises when the transfer fails.
    """
    if os.path.exists(dst):
        return

    os.makedirs(Path(dst).parent, exist_ok=True)
    print('downloading %s -> %s...' % (src, dst))

    partial = f"{dst}.part"
    try:
        urlretrieve(src, partial)
        # Atomic rename: dst only ever appears fully written.
        os.replace(partial, dst)
    finally:
        # Only still present when the transfer or the rename failed.
        if os.path.exists(partial):
            os.remove(partial)

def prepare(kind, size):
    """Make sure the query and dataset HDF5 files for ``kind``/``size`` exist locally.

    Both files are stored under ``<data_directory>/<kind>/<size>/`` as
    ``query.h5`` and ``dataset.h5``; fetching is delegated to ``download``.

    Args:
        kind: Embedding kind, interpolated into the dataset URL.
        size: Dataset size label, interpolated into the dataset URL and used
            as a directory name.
    """
    dataset_base_url = "https://sisap-23-challenge.s3.amazonaws.com/SISAP23-Challenge"
    sources = (
        ("query", "http://ingeotec.mx/~sadit/sisap2024-data/public-queries-2024-laion2B-en-clip768v2-n=10k.h5"),
        ("dataset", f"{dataset_base_url}/laion2B-en-{kind}-n={size}.h5"),
    )

    target_dir = os.path.join(data_directory, kind, size)
    for stem, url in sources:
        download(url, os.path.join(target_dir, f"{stem}.h5"))

def store_results(dst, algo, kind, D, I, buildtime, querytime, params, size):
    """Write the outcome of one search run to an HDF5 file at ``dst``.

    Args:
        dst: Output file path; missing parent directories are created and an
            existing file is overwritten (the file is opened in ``'w'`` mode).
        algo: Stored as the ``algo`` attribute.
        kind: Stored as the ``data`` attribute.
        D: Array of distances, stored as the ``dists`` dataset.
        I: Array of neighbour ids, stored as the ``knns`` dataset.
        buildtime: Stored as the ``buildtime`` attribute.
        querytime: Stored as the ``querytime`` attribute.
        params: Stored as the ``params`` attribute.
        size: Stored as the ``size`` attribute.
    """
    os.makedirs(Path(dst).parent, exist_ok=True)
    # The context manager closes the file even when one of the writes raises,
    # so a failed run does not leak an open HDF5 handle.
    with h5py.File(dst, 'w') as f:
        f.attrs['algo'] = algo
        f.attrs['data'] = kind
        f.attrs['buildtime'] = buildtime
        f.attrs['querytime'] = querytime
        f.attrs['size'] = size
        f.attrs['params'] = params
        f.create_dataset('knns', I.shape, dtype=I.dtype)[:] = I
        f.create_dataset('dists', D.shape, dtype=D.dtype)[:] = D


def run(size, M, ef_construction):
    """Build an hnswlib index over the dataset and sweep the query-time ``ef``.

    The dataset and query files are downloaded if necessary, the index is
    built by streaming the dataset in chunks, and for every ``ef`` value in
    the sweep one result file is written through ``store_results`` under
    ``result/<kind>/<size>/``.

    Args:
        size: Dataset size label; selects the download URL and the data and
            result directories.
        M: Passed to ``index.init_index`` as ``M``.
        ef_construction: Passed to ``index.init_index`` as ``ef_construction``.
    """
    kind = "clip768v2"
    key = "emb"
    print(f"Running HNSW on {kind}-{size}")
    index_identifier = f"HNSW-M-{M}-EFC-{ef_construction}"

    #> Download dataset if necessary
    prepare(kind, size)
    D = 768

    #> Initialize the HNSW index
    index = hnswlib.Index(space='ip', dim=D)  # possible options are l2, cosine or ip

    #> Load the dataset:
    start_time = time.time()
    with h5py.File(os.path.join(data_directory, kind, size, "dataset.h5"), 'r') as f:
        dataset = f[key]
        N, DD = dataset.shape
        print(f'Datset has N={N} rows and D={DD} columns')
        index.init_index(max_elements=N, ef_construction=ef_construction, M=M, random_seed=10)

        # Stream the dataset in fixed-size chunks so that only one chunk has
        # to be held in memory at a time.
        chunk_size = 100000
        for start_index in range(0, N, chunk_size):
            end_index = min(start_index + chunk_size, N)

            # load this chunk into memory
            data_chunk = dataset[start_index:end_index]

            # add it to hnsw index
            index.add_items(data_chunk)
    build_time = time.time() - start_time
    print(f"Done constructing index in {build_time:.4} (s)")

    # get the queries; the file is closed again as soon as they are copied
    # into memory
    with h5py.File(os.path.join(data_directory, kind, size, "query.h5"), "r") as f:
        queries = np.array(f[key], dtype=np.float32)

    #> Searching on the index
    ef_vec = [30, 50, 70, 100, 140, 190, 250, 320, 400, 500, 650, 800, 1000, 1200, 1500, 1800, 2100, 2500, 3000]
    for ef in ef_vec:
        print(f"Searching with ef={ef}")
        start = time.time()
        index.set_ef(ef)  # ef should always be > k
        labels, distances = index.knn_query(queries, k=30)
        search_time = time.time() - start
        print(f"Done searching in {search_time:.4}s.")

        # save the results
        # No explicit ids are passed to add_items, so the labels are the
        # 0-based insertion positions; groundtruth is 1-indexed.
        labels = labels + 1
        identifier = f"index=({index_identifier}),query=(ef={ef})"
        store_results(os.path.join("result/", kind, size, f"{identifier}.h5"), index_identifier, kind, distances, labels, build_time, search_time, identifier, size)



if __name__ == "__main__":
    # Command-line entry point: parse the dataset size and the two HNSW
    # construction parameters, echo them, and run the benchmark.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--size",
        type=str,
        default="300K",
        # Validated by argparse rather than an `assert`, which would be
        # stripped when Python runs with -O.
        choices=["300K", "10M", "100M"],
    )
    parser.add_argument(
        "-M",
        type=int,
        default=20,
    )
    parser.add_argument(
        "-E",
        type=int,
        default=100,
    )
    args = parser.parse_args()

    print("Running Script With:")
    print(f" * N={args.size}")
    print(f" * M={args.M} | HNSW Parameter M")
    print(f" * EFC={args.E} | HNSW Parameter ef_construction")
    run(args.size, args.M, args.E)
    print("Done! Have a good day!")
50 changes: 50 additions & 0 deletions search/search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
'''
Cole Foster
August 4th, 2024
SISAP 2024 Indexing Challenge
'''
import argparse
import GraphHierarchy
import h5py
import numpy as np
import os
from pathlib import Path
from urllib.request import urlretrieve
import time


def run(size, M, ef_construction):
    """Create a ``GraphHierarchy`` index (work-in-progress entry point).

    Only the index object is constructed so far: ``size``, ``M`` and
    ``ef_construction`` are accepted but not used yet, and nothing is
    returned.
    """
    #> Initialize the GraphHierarchy index
    graph_index = GraphHierarchy.Index(space='ip', dim=768)  # possible options are l2, cosine or ip




if __name__ == "__main__":
    # Command-line entry point: parse the dataset size and the two
    # construction parameters, echo them, and hand them to run().
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--size",
        type=str,
        default="300K",
        # Validated by argparse rather than an `assert`, which would be
        # stripped when Python runs with -O.
        choices=["300K", "10M", "100M"],
    )
    parser.add_argument(
        "-M",
        type=int,
        default=20,
    )
    parser.add_argument(
        "-E",
        type=int,
        default=100,
    )
    args = parser.parse_args()

    print("Running Script With:")
    print(f" * N={args.size}")
    print(f" * M={args.M} | HNSW Parameter M")
    print(f" * EFC={args.E} | HNSW Parameter ef_construction")
    run(args.size, args.M, args.E)
    print("Done! Have a good day!")
13 changes: 13 additions & 0 deletions src/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
hnswlib.egg-info/
build/
dist/
tmp/
python_bindings/tests/__pycache__/
*.pyd
hnswlib.cpython*.so
var/
.idea/
.vscode/
.vs/
**.DS_Store
*.pyc
105 changes: 105 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# Build description for the hnswlib target: declares the library, its include
# paths, and the install/export rules.
cmake_minimum_required(VERSION 3.0...3.26)

project(hnswlib
LANGUAGES CXX)

# GNUInstallDirs provides the CMAKE_INSTALL_* destination variables used below;
# CheckCXXCompilerFlag provides check_cxx_compiler_flag(), used further down
# when choosing compiler flags for the examples.
include(GNUInstallDirs)
include(CheckCXXCompilerFlag)

# INTERFACE library: the target has no sources of its own and only carries
# usage requirements (the include directories below). The ALIAS makes the
# namespaced name hnswlib::hnswlib available inside this build tree as well.
add_library(hnswlib INTERFACE)
add_library(hnswlib::hnswlib ALIAS hnswlib)

# Consumers building in-tree include from this source directory; consumers of
# an installed copy include from the install include directory.
target_include_directories(hnswlib INTERFACE
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)

# Install
# Copies the hnswlib/ directory into the include destination and exports the
# target under the hnswlib:: namespace as hnswlibConfig.cmake.
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/hnswlib
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

install(TARGETS hnswlib
EXPORT hnswlibTargets)

install(EXPORT hnswlibTargets
FILE hnswlibConfig.cmake
NAMESPACE hnswlib::
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hnswlib)

# Examples and tests
# HNSWLIB_EXAMPLES defaults to ON only when this project is the top-level
# project (CMAKE_PROJECT_NAME equals PROJECT_NAME); when it is pulled in from
# another project the default is OFF.
if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
option(HNSWLIB_EXAMPLES "Build examples and tests." ON)
else()
option(HNSWLIB_EXAMPLES "Build examples and tests." OFF)
endif()
if(HNSWLIB_EXAMPLES)
set(CMAKE_CXX_STANDARD 11)

# Per-compiler flags. Each branch assigns CMAKE_CXX_FLAGS outright, so any
# flags passed in from outside are replaced rather than extended.
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# NOTE(review): this branch passes `-openmp` while the GNU branch below passes
# `-fopenmp` - confirm the spelling is intended.
SET( CMAKE_CXX_FLAGS "-Ofast -std=c++11 -DHAVE_CXX0X -openmp -fpic -ftree-vectorize" )
# Prefer -march=native when the compiler accepts it; otherwise try the
# Apple M1 CPU flag.
check_cxx_compiler_flag("-march=native" COMPILER_SUPPORT_NATIVE_FLAG)
if(COMPILER_SUPPORT_NATIVE_FLAG)
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native" )
message("set -march=native flag")
else()
check_cxx_compiler_flag("-mcpu=apple-m1" COMPILER_SUPPORT_M1_FLAG)
if(COMPILER_SUPPORT_M1_FLAG)
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=apple-m1" )
message("set -mcpu=apple-m1 flag")
endif()
endif()
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
# NOTE(review): `-lrt` is a link-time library option placed in the compile
# flags, and `-w` suppresses all warnings - confirm both are wanted.
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -std=c++11 -DHAVE_CXX0X -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" )
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
SET( CMAKE_CXX_FLAGS "/O2 -DHAVE_CXX0X /W1 /openmp /EHsc" )
endif()

# examples
# One executable per example source, each linked against the hnswlib
# interface target to pick up its include directories.
add_executable(example_search examples/cpp/example_search.cpp)
target_link_libraries(example_search hnswlib)

add_executable(example_epsilon_search examples/cpp/example_epsilon_search.cpp)
target_link_libraries(example_epsilon_search hnswlib)

add_executable(example_multivector_search examples/cpp/example_multivector_search.cpp)
target_link_libraries(example_multivector_search hnswlib)

add_executable(example_filter examples/cpp/example_filter.cpp)
target_link_libraries(example_filter hnswlib)

add_executable(example_replace_deleted examples/cpp/example_replace_deleted.cpp)
target_link_libraries(example_replace_deleted hnswlib)

add_executable(example_mt_search examples/cpp/example_mt_search.cpp)
target_link_libraries(example_mt_search hnswlib)

add_executable(example_mt_filter examples/cpp/example_mt_filter.cpp)
target_link_libraries(example_mt_filter hnswlib)

add_executable(example_mt_replace_deleted examples/cpp/example_mt_replace_deleted.cpp)
target_link_libraries(example_mt_replace_deleted hnswlib)

# tests
# Built as plain executables; no add_test()/enable_testing() registration is
# made in this section.
add_executable(multivector_search_test tests/cpp/multivector_search_test.cpp)
target_link_libraries(multivector_search_test hnswlib)

add_executable(epsilon_search_test tests/cpp/epsilon_search_test.cpp)
target_link_libraries(epsilon_search_test hnswlib)

add_executable(test_updates tests/cpp/updates_test.cpp)
target_link_libraries(test_updates hnswlib)

add_executable(searchKnnCloserFirst_test tests/cpp/searchKnnCloserFirst_test.cpp)
target_link_libraries(searchKnnCloserFirst_test hnswlib)

add_executable(searchKnnWithFilter_test tests/cpp/searchKnnWithFilter_test.cpp)
target_link_libraries(searchKnnWithFilter_test hnswlib)

add_executable(multiThreadLoad_test tests/cpp/multiThreadLoad_test.cpp)
target_link_libraries(multiThreadLoad_test hnswlib)

add_executable(multiThread_replace_test tests/cpp/multiThread_replace_test.cpp)
target_link_libraries(multiThread_replace_test hnswlib)

# The only target here that is built from two source files.
add_executable(main tests/cpp/main.cpp tests/cpp/sift_1b.cpp)
target_link_libraries(main hnswlib)
endif()
Loading

0 comments on commit 80cee5a

Please sign in to comment.