@@ -75,18 +75,19 @@ def __init__(self, data_size = 0, initialCap = 0, M = 32, ef_c = 512, ef_r = 10,

        data = load_data("dbpedia-768")
        self.num_elements = data_size if data_size != 0 else data.shape[0]
-        self.initialCap = initialCap if initialCap != 0 else 2 * self.num_elements
+        #self.initialCap = initialCap if initialCap != 0 else 2 * self.num_elements
+        self.initialCap = initialCap if initialCap != 0 else self.num_elements

        self.data = data[:self.num_elements]
        self.dim = len(self.data[0])
        self.metric = metric
-        self.type = data_type
+        self.data_type = data_type
        self.is_multi = is_multi

        self.hnsw_params = create_hnsw_params(dim = self.dim,
                                              num_elements = self.initialCap,
                                              metric = self.metric,
-                                              data_type = self.type,
+                                              data_type = self.data_type,
                                              ef_construction = ef_c,
                                              m = M,
                                              ef_runtime = ef_r,
@@ -102,22 +103,18 @@ def create_tiered(self):

    def create_hnsw(self):
        return HNSWIndex(self.hnsw_params)
-
-    def set_num_vectors_per_label(self, num_per_label = 1):
-        self.num_per_label = num_per_label

    def init_and_populate_flat_index(self):
        bfparams = BFParams()
        bfparams.initialCapacity = self.num_elements
        bfparams.dim = self.dim
-        bfparams.type = self.type
+        bfparams.type = self.data_type
        bfparams.metric = self.metric
        bfparams.multi = self.is_multi
        self.flat_index = BFIndex(bfparams)

        for i, vector in enumerate(self.data):
-            for _ in range(self.num_per_label):
-                self.flat_index.add_vector(vector, i)
+            self.flat_index.add_vector(vector, i)

        return self.flat_index

@@ -129,6 +126,16 @@ def init_and_populate_hnsw_index(self):
        self.hnsw_index = hnsw_index
        return hnsw_index

+    def populate_index(self, index):
+        start = time.time()
+        duration = 0
+        for label, vector in enumerate(self.data):
+            start_add = time.time()
+            index.add_vector(vector, label)
+            duration += time.time() - start_add
+        end = time.time()
+        return (start, duration, end)
+
    def generate_random_vectors(self, num_vectors):
        vectors = 0
        np_file_path = os.path.join(f'np_{num_vectors}vec_dim{self.dim}.npy')
@@ -154,7 +161,12 @@ def insert_in_batch(self, index, data, data_first_idx, batch_size, first_label):
            duration += time.time() - start_add
        end = time.time()
        return (duration, end)
+
+    def generate_queries(self, num_queries):
+        self.rng = np.random.default_rng(seed = 47)
+        queries = self.rng.random((num_queries, self.dim))
+        return np.float32(queries) if self.data_type == VecSimType_FLOAT32 else queries


def create_dbpedia():
    indices_ctx = DBPediaIndexCtx()
@@ -192,7 +204,7 @@ def create_tiered():
    create_tiered()

def create_dbpedia_graph():
-    indices_ctx = DBPediaIndexCtx(data_size = 100000)
+    indices_ctx = DBPediaIndexCtx()

    threads_num = TIEREDIndex.get_threads_num()
    print(f"thread num = {threads_num}")
@@ -283,9 +295,68 @@ def create_hnsw():
    print(f"Start hnsw creation")

    create_hnsw()
+
+def search_insert(is_multi: bool, num_per_label = 1):
+    indices_ctx = DBPediaIndexCtx(data_size = 1000, mode = CreationMode.CREATE_TIERED_INDEX, is_multi = is_multi)
+    index = indices_ctx.tiered_index
+
+    num_elements = indices_ctx.num_elements
+
+    query_data = indices_ctx.generate_queries(num_queries = 1)
+
+    # Add vectors to the flat index.
+    bf_index = indices_ctx.init_and_populate_flat_index()
+
+    # Start background insertion to the tiered index.
+    index_start, _, _ = indices_ctx.populate_index(index)
+
+    correct = 0
+    k = 10
+    searches_number = 0
+
+    # config knn log
+    index.start_knn_log()
+
+    # run knn query every 1 s.
+    total_tiered_search_time = 0
+    prev_bf_size = num_elements
+    while index.hnsw_label_count() < num_elements:
+        # For each run get the current hnsw size and the query time.
+        bf_curr_size = index.get_curr_bf_size(mode = 'insert_and_knn')
+        query_start = time.time()
+        tiered_labels, _ = index.knn_query(query_data, k)
+        query_dur = time.time() - query_start
+        total_tiered_search_time += query_dur
+
+        print(f"query time = {round_ms(query_dur)} ms")
+
+        # BF size should decrease.
+        print(f"bf size = {bf_curr_size}")
+        assert bf_curr_size < prev_bf_size
+
+        # Run the query also in the bf index to get the ground truth results.
+        bf_labels, _ = bf_index.knn_query(query_data, k)
+        correct += len(np.intersect1d(tiered_labels[0], bf_labels[0]))
+        time.sleep(1)
+        searches_number += 1
+        prev_bf_size = bf_curr_size
+
+    index.reset_log()
+
+    # HNSW labels count updates before the job is done, so we need to wait for the queue to be empty.
+    index.wait_for_index(1)
+    index_dur = time.time() - index_start
+    print(f"indexing during search in tiered took {round_(index_dur)} s")
+
+    # Measure recall.
+    recall = float(correct)/(k * searches_number)
+    print("Average recall is:", round_(recall, 3))
+    print("tiered query per seconds: ", round_(searches_number / total_tiered_search_time))

def test_main():
    print("Test creation")
-    # create_dbpedia()
-    create_dbpedia_graph()
+    create_dbpedia()
+    # create_dbpedia_graph()
+    print(f"\nStart insert & search test")
+    # search_insert(is_multi=False)

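For reference, a minimal, self-contained sketch (not part of the commit) of the recall bookkeeping that the new search_insert function performs: each tiered KNN result set is intersected with the brute-force ground truth from the flat index, and recall is averaged over k hits per search. The label arrays below are hypothetical stand-ins for the labels returned by knn_query, and the built-in round stands in for the file's round_ helper.

import numpy as np

# Hypothetical stand-ins for the label arrays returned by knn_query:
# each row holds the k labels returned for one query.
k = 10
tiered_labels = np.array([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
bf_labels = np.array([[1, 2, 3, 4, 5, 6, 7, 8, 42, 43]])

correct = 0
searches_number = 0

# One "search iteration": count how many tiered results also appear
# in the ground-truth (brute-force) results for the same query.
correct += len(np.intersect1d(tiered_labels[0], bf_labels[0]))
searches_number += 1

# Average recall over all searches, as in search_insert above.
recall = float(correct)/(k * searches_number)
print("Average recall is:", round(recall, 3))  # 0.8 for this toy data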