Skip to content

Commit f988b58

Browse files
lerman25rfsaliev
andauthored
[MOD-10236] Add serialized SVS indices (#792)
* generalize * remove serializer.cpp from cmake * prepare merge with rafik commit * [SVS] Implement Save/Load + test * seperate hnsw_serializer to h and cpp * remove get version impl * save impl * add load * change camelcase * for mat * generalzie saveIndexFields * format * compare metadata on load * Add checkIntegrity with error * checkIntegrity * remove duplicate verification in compare meta data * format * svs serializetion version testing * Revert "svs serializetion version testing" This reverts commit 9ed7730. * common serializer test * remove changes_num from metadata * Add location c'tor * Add location ctor and to test * Remove outdated comment from serializer header * Enhance documentation for loadIndex function in SVSIndex * Add comments * format + remove test * enable tests * serializer test * format * reset SVS to master * add logging to test_svs * format * remove duplicate NewIndexImpl * expose loadIndex in VecSimIndex, add BUILD_TEST gurad * remove string ctor from SVSIndex * format * fix BUILD_TEST in svs_factory * document loadIndex * move loadIndex to serializer * remove excess declarations * remove extra ; * compatable -> compatible * remove redundant params from test * remove comments from threadpool_handle * remove error context comments * add checkIntegrity * update checkIntegrity and format * move loadIndex to SVSSerializer * update bindings * format * add test * add single * adjust labels * Refactor save_load test to simplify vector generation logic * add HAVE_SVS guard * Add missing include for <sstream> in svs_serializer.h * Add labels count to bindings * Files and script * format * Fix comments, add assert --------- Co-authored-by: Rafik Saliev <[email protected]>
1 parent 961d6eb commit f988b58

File tree

5 files changed

+163
-1
lines changed

5 files changed

+163
-1
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
# Ignore benchmark fetched data but not the source file
1212
/tests/benchmark/data/*
1313
!/tests/benchmark/data/hnsw_indices
14+
!/tests/benchmark/data/svs_indices/
1415
!/tests/benchmark/data/*.py
1516

1617
# Prerequisites

src/python_bindings/bindings.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -619,6 +619,8 @@ class PySVSIndex : public PyVecSimIndex {
619619
assert(svs_index);
620620
svs_index->loadIndex(location);
621621
}
622+
623+
/// Returns the `indexLabelCount` field of the common section of the wrapped
/// index's debug info.
size_t getLabelsCount() const {
    return this->index->debugInfo().commonInfo.indexLabelCount;
}
622624
};
623625

624626
class PyTiered_SVSIndex : public PyTieredIndex {
@@ -845,7 +847,8 @@ PYBIND11_MODULE(VecSim, m) {
845847
py::arg("labels"))
846848
.def("check_integrity", &PySVSIndex::checkIntegrity)
847849
.def("save_index", &PySVSIndex::saveIndex, py::arg("location"))
848-
.def("load_index", &PySVSIndex::loadIndex, py::arg("location"));
850+
.def("load_index", &PySVSIndex::loadIndex, py::arg("location"))
851+
.def("get_labels_count", &PySVSIndex::getLabelsCount);
849852

850853
py::class_<PyTiered_SVSIndex, PyTieredIndex>(m, "Tiered_SVSIndex")
851854
.def(py::init([](const SVSParams &svs_params, const TieredSVSParams &tiered_svs_params,
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
# SVS index serializer for benchmarks.
2+
# Serializes datasets to SVS index format for use by C++ and Python benchmarks.
3+
4+
import numpy as np
5+
import VecSim
6+
import h5py
7+
import os
8+
9+
# Resolve the benchmark data directory from the current working directory.
# The script must be started either from the data directory itself or from the
# repository root; any other starting point is rejected.
location = os.path.abspath('.')
# Compare the last path component instead of a hard-coded '/<name>' suffix so
# the check does not depend on the platform's path separator.
_cwd_name = os.path.basename(location)
if _cwd_name == 'data':
    # Joining with '' appends a trailing separator.
    location = os.path.join(location, '')
elif _cwd_name == 'VectorSimilarity':
    location = os.path.join(location, 'tests', 'benchmark', 'data', '')
else:
    print('unexpected location:', location)
    print('expected to be in `./VectorSimilarity/tests/benchmark/data` or `./VectorSimilarity`')
    # `exit` is a convenience helper installed by the `site` module and is not
    # guaranteed to exist (e.g. `python -S`); raise SystemExit directly instead.
    raise SystemExit(1)
print('working at:', location)
20+
21+
# Dataset descriptions consumed by serialize() when no explicit list is given.
# Each entry names the output ('filename' / 'nickname'), the source HDF5 file,
# the vector dimension and the distance metric; 'multi' marks a dataset whose
# labels may own several vectors.
DEFAULT_FILES = [
    dict(
        filename='dbpedia-768',
        nickname='dbpedia',
        dim=768,
        metric=VecSim.VecSimMetric_Cosine,
        hdf5_file='dbpedia-cosine-dim768.hdf5',
    ),
    dict(
        filename='fashion_images_multi_value',
        nickname='fashion_images_multi_value',
        hdf5_file='fashion_images_multi_value-cosine-dim512.hdf5',
        dim=512,
        metric=VecSim.VecSimMetric_Cosine,
        multi=True,
    ),
]

# Per-element storage size and numpy dtype for each supported VecSim vector type.
TYPES_ATTR = {
    VecSim.VecSimType_FLOAT32: dict(size_in_bytes=4, vector_type=np.float32),
}
42+
43+
def load_vectors_and_labels_from_hdf5(input_file):
    """
    Read the 'vectors' and 'labels' datasets of an HDF5 file into memory.

    Returns:
        A `(vectors, labels)` tuple of numpy arrays on success. If anything
        goes wrong while opening or reading the file, the error is printed
        and `(None, None)` is returned instead.
    """
    try:
        with h5py.File(input_file, 'r') as hdf5:
            # `[:]` copies each dataset out of the file before it is closed.
            loaded = (hdf5['vectors'][:], hdf5['labels'][:])
        print(f"Loaded {input_file}: vectors {loaded[0].shape}, labels {loaded[1].shape}")
    except Exception as err:
        print(f"Error loading HDF5 file: {err}")
        return None, None
    return loaded
59+
60+
def serialize(files=None):
    """
    Build, save and verify SVS indices for every dataset description in `files`.

    For each entry the vectors and labels are read from `<location>/<hdf5_file>`.
    One index is then built per quantization mode (VecSimSvsQuant_8 and
    VecSimSvsQuant_NONE), saved to `<location>/<nickname>_svs_8` or
    `<location>/<nickname>_svs_none`, loaded back into a fresh index and
    checked.

    Args:
        files: iterable of dicts describing the datasets. Required keys:
            'filename' and 'metric'. Optional keys: 'nickname' (defaults to
            'filename'), 'dim' (auto-detected from the data when absent),
            'multi' (default False), 'type' (default VecSimType_FLOAT32),
            'hdf5_file' (default '<filename>.hdf5'), 'graph_max_degree'
            (default 128) and 'construction_window_size' (default 512).
            When None, DEFAULT_FILES is used.

    Datasets that cannot be loaded, are empty, or have an unexpected shape are
    reported on stdout and skipped. A dimension mismatch or a failed label
    count verification raises AssertionError.
    """
    if files is None:
        files = DEFAULT_FILES

    for file in files:
        filename = file['filename']
        nickname = file.get('nickname', filename)
        dim = file.get('dim', None)
        metric = file['metric']
        is_multi = file.get('multi', False)
        vec_type = file.get('type', VecSim.VecSimType_FLOAT32)

        # Load vectors/labels
        hdf5_file = file.get('hdf5_file', f"{filename}.hdf5")
        hdf5_path = os.path.join(location, hdf5_file)
        print(f"Loading vectors from {hdf5_path}")
        vectors, labels = load_vectors_and_labels_from_hdf5(hdf5_path)

        # The loader signals failure with (None, None); check before touching the arrays.
        if vectors is None or labels is None:
            print(f"Failed to load data from {hdf5_path}, skipping...")
            continue

        if vectors.size == 0 or labels.size == 0:
            print(f"Error: {hdf5_path} contains no vectors or no labels, skipping...")
            continue

        if is_multi:
            # Flatten (labels, vectors-per-label, D) -> (N, D) and repeat every
            # label once per vector it owns so both arrays line up again.
            if vectors.ndim == 3:
                vectors = vectors.reshape(-1, vectors.shape[-1])
            labels = np.repeat(labels, vectors.shape[0] // labels.shape[0])
        elif vectors.ndim == 3 and vectors.shape[1] == 1:
            # Handle shape (N, 1, D) -> (N, D)
            vectors = vectors.squeeze(axis=1)

        if vectors.ndim != 2:
            print(f"Error: Expected 2D vectors, got shape {vectors.shape}")
            continue

        if labels.shape[0] != vectors.shape[0]:
            print(f"Error: {labels.shape[0]} labels do not match {vectors.shape[0]} vectors, skipping...")
            continue

        # Update dimension if not specified
        if dim is None:
            dim = vectors.shape[1]
            print(f"Auto-detected dimension: {dim}")

        assert dim == vectors.shape[1], f"Dimension mismatch: {dim} != {vectors.shape[1]}"

        # The dtype conversions do not depend on the quantization mode, so do
        # them once per dataset instead of once per index.
        if vectors.dtype != np.float32:
            print(f"Converting vectors from {vectors.dtype} to float32")
            vectors = vectors.astype(np.float32)
        if labels.dtype != np.uint64:
            print(f"Converting labels from {labels.dtype} to uint64")
            labels = labels.astype(np.uint64)

        # Counting distinct labels does not assume the labels are contiguous from 0.
        expected_label_count = len(np.unique(labels))

        # Suffix appended to the output directory name for each quantization mode.
        bits_to_str = {
            VecSim.VecSimSvsQuant_NONE: '_none',
            VecSim.VecSimSvsQuant_8: '_8',
        }
        for bits in [VecSim.VecSimSvsQuant_8, VecSim.VecSimSvsQuant_NONE]:
            # Create SVS parameters
            svs_params = VecSim.SVSParams()
            svs_params.type = vec_type
            svs_params.dim = dim
            svs_params.metric = metric
            svs_params.graph_max_degree = file.get('graph_max_degree', 128)
            svs_params.construction_window_size = file.get('construction_window_size', 512)
            svs_params.quantBits = bits
            svs_params.multi = is_multi

            print(f"Creating SVS index for {filename} (dim={dim}, metric={metric})")

            # Create index and add vectors
            svs_index = VecSim.SVSIndex(svs_params)
            print(f"Adding {len(vectors)} vectors...")
            svs_index.add_vector_parallel(vectors, labels)

            # Save index
            index_dir = os.path.join(location, nickname + '_svs' + bits_to_str[bits])
            os.makedirs(index_dir, exist_ok=True)
            svs_index.save_index(index_dir)
            print(f"Index saved to {index_dir}")
            print(f"Final index size: {svs_index.index_size()}")

            # Verify by loading the saved files into a fresh index.
            print("Verifying saved index...")
            svs_index_verify = VecSim.SVSIndex(svs_params)
            svs_index_verify.load_index(index_dir)
            svs_index_verify.check_integrity()
            print(f"Verified index size: {svs_index_verify.index_size()}")
            loaded_label_count = svs_index_verify.get_labels_count()
            assert loaded_label_count == expected_label_count, (
                f"Label count mismatch after load: {loaded_label_count} != {expected_label_count}"
            )
141+
142+
143+
# Script entry point: calls serialize() with its default arguments.
if __name__ == '__main__':
    serialize()
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/svs-dbpedia-cosine-dim768-quant-8.tar.gz
2+
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/svs-dbpedia-cosine-dim768-quant-none.tar.gz
3+
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/dbpedia-cosine-dim768-test_vectors.raw
4+
5+
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/svs_fashion_images_multi_value-cosine-dim512-quant-none.tar.gz
6+
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/svs_fashion_images_multi_value-cosine-dim512-quant-8.tar.gz
7+
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/fashion_images_multi_value-cosine-dim512-test_vectors.raw
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/svs-dbpedia-cosine-dim768-quant-8.tar.gz
2+
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/svs-dbpedia-cosine-dim768-quant-none.tar.gz
3+
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/dbpedia-cosine-dim768-test_vectors.raw
4+
5+
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/svs_fashion_images_multi_value-cosine-dim512-quant-none.tar.gz
6+
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/svs_fashion_images_multi_value-cosine-dim512-quant-8.tar.gz
7+
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/fashion_images_multi_value-cosine-dim512-test_vectors.raw

0 commit comments

Comments
 (0)