Skip to content

Commit 9b5007e

Browse files
add int filter
1 parent 54c94e2 commit 9b5007e

File tree

10 files changed

+319
-11
lines changed

10 files changed

+319
-11
lines changed

vectordb_bench/backend/cases.py

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from vectordb_bench import config
66
from vectordb_bench.backend.clients.api import MetricType
7-
from vectordb_bench.backend.filter import Filter, FilterOp, IntFilter, LabelFilter, NonFilter, non_filter
7+
from vectordb_bench.backend.filter import Filter, FilterOp, IntFilter, LabelFilter, NewIntFilter, NonFilter, non_filter
88
from vectordb_bench.base import BaseModel
99
from vectordb_bench.frontend.components.custom.getCustomConfig import CustomDatasetConfig
1010

@@ -54,6 +54,8 @@ class CaseType(Enum):
5454

5555
LabelFilterPerformanceCase = 300
5656

57+
NewIntFilterPerformanceCase = 400
58+
5759
def case_cls(self, custom_configs: dict | None = None) -> type["Case"]:
5860
if custom_configs is None:
5961
return type2case.get(self)()
@@ -130,6 +132,7 @@ class PerformanceCase(Case):
130132
filter_rate: float | None = None
131133
load_timeout: float | int = config.LOAD_TIMEOUT_DEFAULT
132134
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_DEFAULT
135+
int_value: float | None = None
133136

134137

135138
class CapacityDim960(CapacityCase):
@@ -471,6 +474,46 @@ def __init__(
471474
)
472475

473476

477+
class NewIntFilterPerformanceCase(PerformanceCase):
    """Performance case that searches a dataset under an integer filter.

    The case is parameterised by a dataset/size pair and a filter rate. The
    concrete filter handed to the runner is built lazily by ``filters`` from
    the dataset's id field and size.
    """

    case_id: CaseType = CaseType.NewIntFilterPerformanceCase
    dataset_with_size_type: DatasetWithSizeType
    filter_rate: float

    def __init__(
        self,
        dataset_with_size_type: DatasetWithSizeType | str,
        filter_rate: float,
        int_value: float | None = 0,
        **kwargs,
    ):
        """Build the case from a dataset/size selector and a filter rate.

        Args:
            dataset_with_size_type: Enum member, or a raw value that is
                converted with ``DatasetWithSizeType(...)``.
            filter_rate: Filter rate; rendered as a percentage in the case
                name and description.
            int_value: Stored on the case and passed to ``IntFilter``.
            **kwargs: Forwarded unchanged to ``PerformanceCase.__init__``.
        """
        if not isinstance(dataset_with_size_type, DatasetWithSizeType):
            dataset_with_size_type = DatasetWithSizeType(dataset_with_size_type)

        # Keyword arguments are evaluated top to bottom, so the name and
        # description are formatted from the caller's raw ``filter_rate``
        # before it is passed through ``IntFilter``.
        super().__init__(
            name=f"Int-Filter-{filter_rate*100:.1f}% - {dataset_with_size_type.value}",
            description=f"Int-Filter-{filter_rate*100:.1f}% Performance Test ({dataset_with_size_type.value})",
            dataset=dataset_with_size_type.get_manager(),
            load_timeout=dataset_with_size_type.get_load_timeout(),
            optimize_timeout=dataset_with_size_type.get_optimize_timeout(),
            # The stored rate is whatever IntFilter reports back for it.
            filter_rate=IntFilter(filter_rate=filter_rate, int_value=int_value).filter_rate,
            int_value=int_value,
            dataset_with_size_type=dataset_with_size_type,
            **kwargs,
        )

    @property
    def filters(self) -> Filter:
        """Return the ``NewIntFilter`` for this case.

        The filter targets the dataset's training id field, and its threshold
        is ``int(dataset size * filter_rate)``.
        """
        dataset_info = self.dataset.data
        threshold = int(dataset_info.size * self.filter_rate)
        return NewIntFilter(
            filter_rate=self.filter_rate,
            int_field=dataset_info.train_id_field,
            int_value=threshold,
        )
515+
516+
474517
class LabelFilterPerformanceCase(PerformanceCase):
475518
case_id: CaseType = CaseType.LabelFilterPerformanceCase
476519
dataset_with_size_type: DatasetWithSizeType
@@ -529,5 +572,6 @@ def filters(self) -> Filter:
529572
CaseType.Performance1536D50K: Performance1536D50K,
530573
CaseType.PerformanceCustomDataset: PerformanceCustomDataset,
531574
CaseType.StreamingPerformanceCase: StreamingPerformanceCase,
575+
CaseType.NewIntFilterPerformanceCase: NewIntFilterPerformanceCase,
532576
CaseType.LabelFilterPerformanceCase: LabelFilterPerformanceCase,
533577
}

vectordb_bench/backend/dataset.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ class BaseDataset(BaseModel):
4848
scalar_labels_file_separated: bool = True
4949
scalar_labels_file: str = "scalar_labels.parquet"
5050
scalar_label_percentages: list[float] = []
51+
scalar_int_rates: list[float] = []
5152
train_id_field: str = "id"
5253
train_vector_field: str = "emb"
5354
test_file: str = "test.parquet"
@@ -164,6 +165,29 @@ class Cohere(BaseDataset):
164165
}
165166
with_scalar_labels: bool = True
166167
scalar_label_percentages: list[float] = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
168+
scalar_int_rates: list[float] = [
169+
0.001,
170+
0.002,
171+
0.005,
172+
0.01,
173+
0.02,
174+
0.05,
175+
0.1,
176+
0.2,
177+
0.3,
178+
0.4,
179+
0.5,
180+
0.6,
181+
0.7,
182+
0.8,
183+
0.9,
184+
0.95,
185+
0.98,
186+
0.99,
187+
0.995,
188+
0.998,
189+
0.999,
190+
]
167191

168192

169193
class Bioasq(BaseDataset):
@@ -178,6 +202,29 @@ class Bioasq(BaseDataset):
178202
}
179203
with_scalar_labels: bool = True
180204
scalar_label_percentages: list[float] = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
205+
scalar_int_rates: list[float] = [
206+
0.001,
207+
0.002,
208+
0.005,
209+
0.01,
210+
0.02,
211+
0.05,
212+
0.1,
213+
0.2,
214+
0.3,
215+
0.4,
216+
0.5,
217+
0.6,
218+
0.7,
219+
0.8,
220+
0.9,
221+
0.95,
222+
0.98,
223+
0.99,
224+
0.995,
225+
0.998,
226+
0.999,
227+
]
181228

182229

183230
class Glove(BaseDataset):
@@ -217,6 +264,29 @@ class OpenAI(BaseDataset):
217264
}
218265
with_scalar_labels: bool = True
219266
scalar_label_percentages: list[float] = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
267+
scalar_int_rates: list[float] = [
268+
0.001,
269+
0.002,
270+
0.005,
271+
0.01,
272+
0.02,
273+
0.05,
274+
0.1,
275+
0.2,
276+
0.3,
277+
0.4,
278+
0.5,
279+
0.6,
280+
0.7,
281+
0.8,
282+
0.9,
283+
0.95,
284+
0.98,
285+
0.99,
286+
0.995,
287+
0.998,
288+
0.999,
289+
]
220290

221291

222292
class DatasetManager(BaseModel):

vectordb_bench/backend/filter.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,23 @@ def groundtruth_file(self) -> str:
5151
raise RuntimeError(msg)
5252

5353

54+
class NewIntFilter(Filter):
    """Filter on an integer field, using the ``FilterOp.NumGE`` operator.

    NOTE(review): presumably the expression is ``int_field >= int_value``;
    confirm against the ``FilterOp`` definition.
    """

    type: FilterOp = FilterOp.NumGE
    # Name of the integer column the filter is applied to.
    int_field: str = "id"
    # Threshold value used by the filter.
    int_value: int

    @property
    def int_rate(self) -> str:
        """Return the filter rate as a percentage tag, e.g. ``int_30p`` or ``int_0.5p``.

        Percentages from 1 to 99 are rendered as whole numbers; anything
        outside that range keeps one decimal place.
        """
        # Round away binary floating-point noise before truncating: without
        # it 0.29 * 100 == 28.999999999999996 would be tagged "int_28p" and
        # point at the wrong ground-truth file.
        r = round(self.filter_rate * 100, 6)
        if 1 <= r <= 99:
            return f"int_{int(r)}p"
        return f"int_{r:.1f}p"

    @property
    def groundtruth_file(self) -> str:
        """Return the ground-truth parquet file name derived from ``int_rate``."""
        return f"neighbors_{self.int_rate}.parquet"
69+
70+
5471
class LabelFilter(Filter):
5572
"""
5673
filter expr: label_field == label_value, like `color == "red"`

vectordb_bench/frontend/components/check_results/filters.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ def getShowDbsAndCases(st, result: list[CaseResult], filter_type: FilterOp) -> t
8989
col=1,
9090
)
9191

92-
if filter_type == FilterOp.StrEqual:
92+
if filter_type == FilterOp.StrEqual or filter_type == FilterOp.NumGE:
9393
container = st.container()
9494
datasetWithSizeTypes = [dataset_with_size_type for dataset_with_size_type in DatasetWithSizeType]
9595
showDatasetWithSizeTypes = filterView(
@@ -102,9 +102,6 @@ def getShowDbsAndCases(st, result: list[CaseResult], filter_type: FilterOp) -> t
102102
datasets = [dataset_with_size_type.get_manager() for dataset_with_size_type in showDatasetWithSizeTypes]
103103
showCaseNames = list(set([case.name for case in allCases if case.dataset in datasets]))
104104

105-
if filter_type == FilterOp.NumGE:
106-
raise NotImplementedError
107-
108105
return showDBNames, showCaseNames
109106

110107

vectordb_bench/frontend/components/check_results/nav.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ def NavToPages(st):
2929
{"name": "Quries Per Dollar", "link": "quries_per_dollar"},
3030
{"name": "Concurrent", "link": "concurrent"},
3131
{"name": "Label Filter", "link": "label_filter"},
32+
{"name": "Int Filter", "link": "int_filter"},
3233
{"name": "Streaming", "link": "streaming"},
3334
{"name": "Tables", "link": "tables"},
3435
{"name": "Custom Dataset", "link": "custom"},
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import plotly.express as px
2+
from vectordb_bench.metric import metric_unit_map
3+
4+
5+
def drawCharts(st, allData, **kwargs):
    """Draw one chart section per dataset, ordered by dataset name.

    Args:
        st: Object providing ``container()``; each container receives a
            subheader and the charts for one dataset.
        allData: Rows indexed by ``"dataset_name"``.
        **kwargs: Forwarded unchanged to ``drawChartByMetric``.
    """
    for dataset_name in sorted({row["dataset_name"] for row in allData}):
        section = st.container()
        section.subheader(dataset_name)
        rows = [row for row in allData if row["dataset_name"] == dataset_name]
        drawChartByMetric(section, rows, **kwargs)
13+
14+
15+
def drawChartByMetric(st, data, metrics=("qps", "recall"), **kwargs):
    """Lay out one column per metric and draw that metric's chart in it.

    Args:
        st: Object providing ``columns(n)``.
        data: Rows passed through to ``drawChart``.
        metrics: Metric keys to chart, one column each.
        **kwargs: Accepted for call compatibility; not used here.
    """
    for column, metric in zip(st.columns(len(metrics)), metrics):
        column.markdown(f"#### {metric}")
        drawChart(column, data, metric)
21+
22+
23+
def getRange(metric, data, padding_multipliers):
    """Compute a padded ``[low, high]`` axis range for ``metric`` over ``data``.

    Args:
        metric: Key looked up in every row; rows lacking it count as 0.
        data: Dict-like rows.
        padding_multipliers: Pair ``(low, high)``; each is multiplied by the
            value span (max - min) to get the padding on that side.

    Returns:
        ``[min - span * low, max + span * high]``. Empty ``data`` yields a
        zero range instead of raising ``ValueError``.
    """
    # Collect the values once rather than once per min/max call.
    values = [d.get(metric, 0) for d in data]
    # ``default`` keeps min/max from raising when there are no rows.
    minV = min(values, default=0)
    maxV = max(values, default=0)
    padding = maxV - minV
    return [
        minV - padding * padding_multipliers[0],
        maxV + padding * padding_multipliers[1],
    ]
32+
33+
34+
def drawChart(st, data: list[dict], metric):
    """Render a line chart of ``metric`` against ``filter_rate``, one line per ``db_name``.

    Args:
        st: Object providing ``plotly_chart``.
        data: Rows indexed by ``"filter_rate"``; plotly additionally reads
            ``metric`` and ``"db_name"`` from each row. The list is not
            modified.
        metric: Key plotted on the y axis and used for the point labels.
    """
    unit = metric_unit_map.get(metric, "")
    x = "filter_rate"
    xrange = getRange(x, data, [0.05, 0.1])

    y = metric
    yrange = getRange(y, data, [0.2, 0.1])

    # Sort a copy so the caller's list keeps its original order; the points
    # must be in ascending x order for the line to be drawn left to right.
    ordered = sorted(data, key=lambda row: row[x])

    fig = px.line(
        ordered,
        x=x,
        y=y,
        color="db_name",
        line_group="db_name",
        text=metric,
        markers=True,
    )
    fig.update_xaxes(range=xrange)
    fig.update_yaxes(range=yrange)
    fig.update_traces(textposition="bottom right", texttemplate="%{y:,.4~r}" + unit)
    fig.update_layout(
        margin=dict(l=0, r=0, t=40, b=0, pad=8),
        legend=dict(orientation="h", yanchor="bottom", y=1, xanchor="right", x=1, title=""),
    )
    st.plotly_chart(fig, use_container_width=True)

vectordb_bench/frontend/components/welcome/welcomePrams.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,12 +94,22 @@ def welcomePrams(st):
9494
"title": "Label Filter Performance",
9595
"description": (
9696
"<span style='font-size: 17px;'>"
97-
"To view the perfomance of datasets under different filter ratios "
97+
"To view the perfomance of datasets under different label filter ratios "
9898
"</span>"
9999
),
100100
"image": "fig/homepage/label_filter.png",
101101
"link": "label_filter",
102102
},
103+
{
104+
"title": "Int Filter Performance",
105+
"description": (
106+
"<span style='font-size: 17px;'>"
107+
"To view the perfomance of datasets under different int filter ratios "
108+
"</span>"
109+
),
110+
"image": "fig/homepage/label_filter.png",
111+
"link": "int_filter",
112+
},
103113
{
104114
"title": "Streaming Performance",
105115
"description": (
@@ -138,7 +148,7 @@ def welcomePrams(st):
138148
for option in options:
139149
option["image"] = get_image_as_base64(option["image"])
140150

141-
for option in options[:6]:
151+
for option in options[:7]:
142152
html_content += f"""
143153
<a href="/{option['link']}" target="_self" style="text-decoration: none;">
144154
<div class="section-card">
@@ -157,7 +167,7 @@ def welcomePrams(st):
157167
<div class="last-row">
158168
"""
159169

160-
for option in options[6:8]:
170+
for option in options[7:9]:
161171
html_content += f"""
162172
<a href="/{option['link']}" target="_self" style="text-decoration: none;">
163173
<div class="section-card">

vectordb_bench/frontend/config/dbCaseConfigs.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,17 @@ def generate_label_filter_cases(dataset_with_size_type: DatasetWithSizeType) ->
219219
]
220220

221221

222+
def generate_int_filter_cases(dataset_with_size_type: DatasetWithSizeType) -> list[CaseConfig]:
    """Build one int-filter case config per filter rate declared by the dataset.

    The rates come from ``scalar_int_rates`` on the dataset behind
    ``dataset_with_size_type``; the configs are returned in that order.
    """
    rates = dataset_with_size_type.get_manager().data.scalar_int_rates
    configs: list[CaseConfig] = []
    for rate in rates:
        custom_case = {"dataset_with_size_type": dataset_with_size_type, "filter_rate": rate}
        configs.append(
            CaseConfig(
                case_id=CaseType.NewIntFilterPerformanceCase,
                custom_case=custom_case,
            )
        )
    return configs
231+
232+
222233
UI_CASE_CLUSTERS: list[UICaseItemCluster] = [
223234
UICaseItemCluster(
224235
label="Search Performance Test",
@@ -249,6 +260,29 @@ def generate_label_filter_cases(dataset_with_size_type: DatasetWithSizeType) ->
249260
UICaseItem(cases=generate_normal_cases(CaseType.Performance1536D500K99P)),
250261
],
251262
),
263+
UICaseItemCluster(
264+
label="New-Int-Filter Search Performance Test",
265+
uiCaseItems=[
266+
UICaseItem(
267+
label=f"Int-Filter Search Performance Test - {dataset_with_size_type.value}",
268+
description=(
269+
f"[Batch Cases]These cases test the search performance of a vector database "
270+
f"with dataset {dataset_with_size_type.value}"
271+
f"under filtering rates of {dataset_with_size_type.get_manager().data.scalar_int_rates}, at varying parallel levels."
272+
f"Results will show index building time, recall, and maximum QPS."
273+
),
274+
cases=generate_int_filter_cases(dataset_with_size_type),
275+
)
276+
for dataset_with_size_type in [
277+
DatasetWithSizeType.CohereMedium,
278+
DatasetWithSizeType.CohereLarge,
279+
DatasetWithSizeType.OpenAIMedium,
280+
DatasetWithSizeType.OpenAILarge,
281+
DatasetWithSizeType.BioasqMedium,
282+
DatasetWithSizeType.BioasqLarge,
283+
]
284+
],
285+
),
252286
UICaseItemCluster(
253287
label="Label-Filter Search Performance Test",
254288
uiCaseItems=[

0 commit comments

Comments
 (0)