Skip to content

Commit 8410891

Browse files
Workload (#53)
**Summary**: basic `Workload` class that lets you read a workload directory. **Demo**: Added tests that pass. <img width="626" alt="Screenshot 2024-12-23 at 14 59 15" src="https://github.com/user-attachments/assets/96bfd927-0842-4e37-a3c7-e9036bc36486" />
1 parent 1d929d3 commit 8410891

9 files changed

+111
-21
lines changed

benchmark/tpch/cli.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import click
55

66
from benchmark.constants import DEFAULT_SCALE_FACTOR
7+
from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES
78
from util.log import DBGYM_LOGGER_NAME
89
from util.shell import subprocess_run
910
from util.workspace import (
@@ -15,8 +16,6 @@
1516
link_result,
1617
)
1718

18-
NUM_TPCH_QUERIES = 22
19-
2019

2120
@click.group(name="tpch")
2221
@click.pass_obj
@@ -38,13 +37,13 @@ def tpch_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None:
3837
@click.option(
3938
"--seed-start",
4039
type=int,
41-
default=15721,
40+
default=DEFAULT_TPCH_SEED,
4241
help="A workload consists of queries from multiple seeds. This is the starting seed (inclusive).",
4342
)
4443
@click.option(
4544
"--seed-end",
4645
type=int,
47-
default=15721,
46+
default=DEFAULT_TPCH_SEED,
4847
help="A workload consists of queries from multiple seeds. This is the ending seed (inclusive).",
4948
)
5049
@click.option(

benchmark/tpch/constants.py

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Seed used when the caller does not pick one. It is the default for both the
# start and the end of the (inclusive) seed range of a TPC-H workload, and it
# is embedded in the default TPC-H workload name suffix.
DEFAULT_TPCH_SEED = 15721

# Number of queries in one TPC-H query set; query numbers run from 1 to this
# value inclusive.
NUM_TPCH_QUERIES = 22

env/integtest_pg_conn.py

+10-14
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,11 @@
33

44
import psycopg
55

6-
from env.integtest_util import IntegtestWorkspace
6+
from env.integtest_util import (
7+
INTEGTEST_BENCHMARK,
8+
INTEGTEST_SCALE_FACTOR,
9+
IntegtestWorkspace,
10+
)
711
from env.pg_conn import PostgresConn
812
from util.pg import (
913
DEFAULT_POSTGRES_PORT,
@@ -12,19 +16,13 @@
1216
)
1317
from util.workspace import (
1418
DEFAULT_BOOT_CONFIG_FPATH,
15-
DBGymConfig,
1619
default_dbdata_parent_dpath,
1720
default_pgbin_path,
1821
default_pristine_dbdata_snapshot_path,
1922
)
2023

21-
BENCHMARK = "tpch"
22-
SCALE_FACTOR = 0.01
23-
2424

2525
class PostgresConnTests(unittest.TestCase):
26-
dbgym_cfg: DBGymConfig
27-
2826
@staticmethod
2927
def setUpClass() -> None:
3028
IntegtestWorkspace.set_up_workspace()
@@ -36,16 +34,14 @@ def setUp(self) -> None:
3634
+ "to ensure this. Be careful about accidentally taking down other people's Postgres instances though.",
3735
)
3836
self.pristine_dbdata_snapshot_path = default_pristine_dbdata_snapshot_path(
39-
IntegtestWorkspace.get_dbgym_cfg().dbgym_workspace_path,
40-
BENCHMARK,
41-
SCALE_FACTOR,
37+
IntegtestWorkspace.get_workspace_path(),
38+
INTEGTEST_BENCHMARK,
39+
INTEGTEST_SCALE_FACTOR,
4240
)
4341
self.dbdata_parent_dpath = default_dbdata_parent_dpath(
44-
IntegtestWorkspace.get_dbgym_cfg().dbgym_workspace_path
45-
)
46-
self.pgbin_dpath = default_pgbin_path(
47-
IntegtestWorkspace.get_dbgym_cfg().dbgym_workspace_path
42+
IntegtestWorkspace.get_workspace_path()
4843
)
44+
self.pgbin_dpath = default_pgbin_path(IntegtestWorkspace.get_workspace_path())
4945

5046
# The reason we restart Postgres every time is to ensure a "clean" starting point
5147
# so that all tests are independent of each other.

env/integtest_util.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@
66

77
from util.workspace import DBGymConfig
88

9+
# These are the values used by set_up_env_integtests.sh.
10+
# TODO: make set_up_env_integtests.sh take in these values directly as envvars.
11+
INTEGTEST_BENCHMARK = "tpch"
12+
INTEGTEST_SCALE_FACTOR = 0.01
13+
914

1015
class IntegtestWorkspace:
1116
"""
@@ -40,4 +45,3 @@ def get_dbgym_cfg() -> DBGymConfig:
4045
def get_workspace_path() -> Path:
4146
with open(IntegtestWorkspace.ENV_INTEGTESTS_DBGYM_CONFIG_FPATH) as f:
4247
return Path(yaml.safe_load(f)["dbgym_workspace_path"])
43-
assert False

env/integtest_workload.py

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import unittest
2+
3+
from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES
4+
from env.integtest_util import (
5+
INTEGTEST_BENCHMARK,
6+
INTEGTEST_SCALE_FACTOR,
7+
IntegtestWorkspace,
8+
)
9+
from env.workload import Workload
10+
from util.workspace import (
11+
default_workload_path,
12+
fully_resolve_path,
13+
get_default_workload_name_suffix,
14+
get_workload_name,
15+
)
16+
17+
18+
class WorkloadTests(unittest.TestCase):
    """Integration test that loads the default integtest workload via `Workload`.

    The workload is looked up in the integration-test workspace using
    INTEGTEST_BENCHMARK and INTEGTEST_SCALE_FACTOR, so it must already have
    been generated there before this test runs.
    """

    @classmethod
    def setUpClass(cls) -> None:
        """Prepare the shared integration-test workspace once for the class."""
        # unittest documents setUpClass as a classmethod that receives the class.
        IntegtestWorkspace.set_up_workspace()

    def test_workload(self) -> None:
        """Check the query order and do a basic sanity check of each query."""
        workload_dpath = fully_resolve_path(
            IntegtestWorkspace.get_dbgym_cfg(),
            default_workload_path(
                IntegtestWorkspace.get_workspace_path(),
                INTEGTEST_BENCHMARK,
                get_workload_name(
                    INTEGTEST_SCALE_FACTOR,
                    get_default_workload_name_suffix(INTEGTEST_BENCHMARK),
                ),
            ),
        )

        workload = Workload(IntegtestWorkspace.get_dbgym_cfg(), workload_dpath)

        # Check the order of query IDs: one query per TPC-H query number, all
        # generated with the default seed.
        expected_order = [
            f"S{DEFAULT_TPCH_SEED}-Q{i}" for i in range(1, NUM_TPCH_QUERIES + 1)
        ]
        self.assertEqual(workload.get_query_order(), expected_order)

        # Sanity check all queries. subTest + assertIn report which query
        # failed and show its text instead of a bare "False is not true".
        for qid, query in zip(
            workload.get_query_order(), workload.get_queries_in_order()
        ):
            with self.subTest(qid=qid):
                self.assertIn("select", query.lower())


if __name__ == "__main__":
    unittest.main()

env/set_up_env_integtests.sh

+2
Original file line numberDiff line numberDiff line change
@@ -16,5 +16,7 @@ export DBGYM_CONFIG_PATH=env/env_integtests_dbgym_config.yaml # Note that this e
1616
WORKSPACE_PATH=$(grep 'dbgym_workspace_path:' $DBGYM_CONFIG_PATH | sed 's/dbgym_workspace_path: //')
1717

1818
python3 task.py benchmark $BENCHMARK data $SCALE_FACTOR
19+
python3 task.py benchmark $BENCHMARK workload --scale-factor $SCALE_FACTOR
20+
1921
python3 task.py dbms postgres build
2022
python3 task.py dbms postgres dbdata $BENCHMARK --scale-factor $SCALE_FACTOR --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE

env/workload.py

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
from pathlib import Path
2+
3+
from util.workspace import DBGymConfig, is_fully_resolved, open_and_save
4+
5+
6+
class Workload:
    """In-memory view of the queries stored in a workload directory.

    The directory must contain an ``order.txt`` file. Every non-blank line of
    that file has the form ``<query id>,<path to query file>``. All listed
    query files are read eagerly in the constructor, and the line order of
    ``order.txt`` defines the query order.
    """

    def __init__(self, dbgym_cfg: "DBGymConfig", workload_dpath: Path) -> None:
        """Read ``order.txt`` and every query file it references.

        Args:
            dbgym_cfg: Config handed to ``open_and_save`` for each file opened.
            workload_dpath: Path of the workload directory; must satisfy
                ``is_fully_resolved``.

        Raises:
            AssertionError: If ``workload_dpath`` or a listed query path does
                not satisfy ``is_fully_resolved``, or ``order.txt`` is missing.
            ValueError: If a non-blank line of ``order.txt`` has no comma.
        """
        self.dbgym_cfg = dbgym_cfg
        self.workload_dpath = workload_dpath
        assert is_fully_resolved(self.workload_dpath)

        # Query text keyed by query ID, and the IDs in file order.
        self.queries: dict[str, str] = {}
        self.query_order: list[str] = []

        order_fpath = self.workload_dpath / "order.txt"
        assert order_fpath.exists(), f"{order_fpath} does not exist"

        with open_and_save(self.dbgym_cfg, order_fpath) as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at the end of
                # the file) instead of failing on them.
                if not line.strip():
                    continue

                qid, qpath = self._parse_order_line(line)
                assert is_fully_resolved(qpath)

                with open_and_save(self.dbgym_cfg, qpath) as qf:
                    self.queries[qid] = qf.read()
                self.query_order.append(qid)

    @staticmethod
    def _parse_order_line(line: str) -> tuple[str, Path]:
        """Split one ``order.txt`` line into ``(query id, query file path)``.

        Only the first comma separates the two fields, so a path that itself
        contains commas is kept intact.

        Raises:
            ValueError: If the line contains no comma.
        """
        qid, sep, raw_qpath = line.strip().partition(",")
        if not sep:
            raise ValueError(
                f"Expected '<query id>,<query path>' in order.txt but got {line!r}"
            )
        return qid, Path(raw_qpath)

    def get_query(self, qid: str) -> str:
        """Return the text of the query with ID ``qid`` (KeyError if unknown)."""
        return self.queries[qid]

    def get_query_order(self) -> list[str]:
        """Return the query IDs in ``order.txt`` order.

        This is the internal list, not a copy.
        """
        return self.query_order

    def get_queries_in_order(self) -> list[str]:
        """Return the text of every query, in ``order.txt`` order."""
        return [self.queries[qid] for qid in self.query_order]

scripts/run_protox_e2e_test.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import yaml
99

1010
from benchmark.constants import DEFAULT_SCALE_FACTOR
11+
from benchmark.tpch.constants import DEFAULT_TPCH_SEED
1112
from util.pg import get_is_postgres_running
1213
from util.workspace import (
1314
default_embedder_path,
@@ -72,7 +73,7 @@ def run_e2e_for_benchmark(benchmark_name: str, intended_dbdata_hardware: str) ->
7273
if benchmark_name == "tpch":
7374
scale_factor = 0.01
7475
query_subset = "all"
75-
workload_name_suffix = f"15721_15721_{query_subset}"
76+
workload_name_suffix = f"{DEFAULT_TPCH_SEED}_{DEFAULT_TPCH_SEED}_{query_subset}"
7677
embedding_datagen_args = "--override-sample-limits lineitem,32768"
7778
embedding_train_args = "--iterations-per-epoch 1 --num-points-to-sample 1 --num-batches 1 --batch-size 64 --start-epoch 15 --num-samples 4 --train-max-concurrent 4 --num-curate 2"
7879
tune_hpo_args = "--num-samples 2 --max-concurrent 2 --workload-timeout 15 --query-timeout 1 --tune-duration-during-hpo 0.01"

util/workspace.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import redis
1616
import yaml
1717

18+
from benchmark.tpch.constants import DEFAULT_TPCH_SEED
1819
from util.log import DBGYM_LOGGER_NAME
1920
from util.shell import subprocess_run
2021

@@ -96,7 +97,7 @@ def get_workload_name(scale_factor: float | str, suffix: str) -> str:
9697

9798
def get_default_workload_name_suffix(benchmark_name: str) -> str:
9899
if benchmark_name == "tpch":
99-
return "15721_15721_all"
100+
return f"{DEFAULT_TPCH_SEED}_{DEFAULT_TPCH_SEED}_all"
100101
if benchmark_name == "job":
101102
return "all"
102103
else:

0 commit comments

Comments
 (0)