
Commit 34a47c0

Fully using new workspace (#67)
**Summary**: The codebase is now fully refactored to use the new `open_and_save()`, `save_file()`, and `link_result()` functions from #66. The old functions have been removed.

**Demo**: Wrote two new integration test files that pass.

![Screenshot 2024-12-30 at 10 45 53](https://github.com/user-attachments/assets/ca197fe1-1a5b-4255-9ec7-515c7687caf1)

![Screenshot 2024-12-30 at 10 50 10](https://github.com/user-attachments/assets/a621b5eb-4619-4e27-a2a7-85c8a8ff501c)

**Details**

* Refactored `dbms/` and `benchmark/` to use the new functions, and wrote tests for both.
* Moved the path functions from `workspace.py` to `gymlib/symlinks_paths.py`.
  * This is important because agents will need access to the DBMS/benchmark paths.
* Renamed all occurrences of `dpath`/`fpath`/`fordpath` to `path`, and renamed `dname`/`fname` to `dirname`/`filename`.
  * A bare name could be confused with a conceptual name, so we prepend `dir`/`file` to disambiguate.
  * Paths are not ambiguous, though, so we just call them paths. Whether a path points to a directory or a file matters a little, but it's cleaner to call both "paths" since that's the general standard.
1 parent 8511688

37 files changed: +1006 −985 lines
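To ground the refactor, here is a minimal sketch of the new workspace flow (a hedged illustration: `dbgym_this_run_path` and `link_result()` appear in the diffs below, while the exact signatures of `open_and_save()` and `save_file()` are defined in #66 and not shown here):

```python
# Hedged sketch of the "build in the run directory, then publish a symlink"
# flow used throughout this commit. `produce_result` is a hypothetical example,
# not a function from the repo.
from pathlib import Path

from util.workspace import DBGymWorkspace


def produce_result(dbgym_workspace: DBGymWorkspace) -> Path:
    # Build the artifact inside the current run's directory.
    result_path = dbgym_workspace.dbgym_this_run_path / "example_result"
    result_path.mkdir(parents=False, exist_ok=False)
    (result_path / "data.txt").write_text("example contents\n")

    # Publish it as a `<name>.link` symlink under the symlinks directory.
    symlink_path = dbgym_workspace.link_result(result_path)
    return symlink_path
```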

.github/workflows/tests.yaml (+1 −1)

```diff
@@ -52,7 +52,7 @@ jobs:
       # Integration tests do require external systems to be running (most commonly a database instance).
       # Unlike end-to-end tests though, they test a specific module in a detailed manner, much like a unit test does.
       env:
-        # We set `INTENDED_DBDATA_HARDWARE` so that it's seen when `integtest_pg_conn.py` executes `_set_up_gymlib_integtest_workspace.sh`.
+        # The CI runs on ssd so we have to set this.
         INTENDED_DBDATA_HARDWARE: ssd
       run: |
         . "$HOME/.cargo/env"
```

benchmark/cli.py (+1 −1)

```diff
@@ -8,7 +8,7 @@
 @click.group(name="benchmark")
 @click.pass_obj
 def benchmark_group(dbgym_workspace: DBGymWorkspace) -> None:
-    dbgym_workspace.append_group("benchmark")
+    pass


 benchmark_group.add_command(tpch_group)
```

benchmark/job/cli.py (+74 −51)

```diff
@@ -2,17 +2,17 @@
 from typing import Optional

 import click
+from gymlib.symlinks_paths import (
+    get_tables_dirname,
+    get_workload_dirname,
+    get_workload_suffix,
+    name_to_linkname,
+)

 from benchmark.constants import DEFAULT_SCALE_FACTOR
 from util.log import DBGYM_LOGGER_NAME
 from util.shell import subprocess_run
-from util.workspace import (
-    DBGymWorkspace,
-    get_default_tables_dname,
-    get_workload_name,
-    is_fully_resolved,
-    link_result,
-)
+from util.workspace import DBGymWorkspace, fully_resolve_path

 JOB_TABLES_URL = "https://event.cwi.nl/da/job/imdb.tgz"
 JOB_QUERIES_URL = "https://event.cwi.nl/da/job/job.tgz"
@@ -137,18 +137,22 @@
 @click.group(name="job")
 @click.pass_obj
 def job_group(dbgym_workspace: DBGymWorkspace) -> None:
-    dbgym_workspace.append_group("job")
+    pass


-@job_group.command(name="data")
+@job_group.command(name="tables")
 # We expose this option to keep its interface consistent with other workloads, but you should never pass in something other than DEFAULT_SCALE_FACTOR.
 @click.argument("scale-factor", type=float)
 @click.pass_obj
-# The reason generate data is separate from create dbdata is because generate-data is generic
+# The reason generate data is separate from create dbdata is because generate data is generic
 # to all DBMSs while create dbdata is specific to a single DBMS.
-def job_data(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None:
+def job_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None:
+    _job_tables(dbgym_workspace, scale_factor)
+
+
+def _job_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None:
     assert scale_factor == DEFAULT_SCALE_FACTOR
-    _download_job_data(dbgym_workspace)
+    _download_job_tables(dbgym_workspace)


 @job_group.command(name="workload")
@@ -161,18 +165,24 @@ def job_data(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None:
 @click.pass_obj
 def job_workload(
     dbgym_workspace: DBGymWorkspace, query_subset: str, scale_factor: float
+) -> None:
+    _job_workload(dbgym_workspace, query_subset, scale_factor)
+
+
+def _job_workload(
+    dbgym_workspace: DBGymWorkspace, query_subset: str, scale_factor: float
 ) -> None:
     assert scale_factor == DEFAULT_SCALE_FACTOR
     _download_job_queries(dbgym_workspace)
     _generate_job_workload(dbgym_workspace, query_subset)


-def _download_job_data(dbgym_workspace: DBGymWorkspace) -> None:
+def _download_job_tables(dbgym_workspace: DBGymWorkspace) -> None:
     _download_and_untar_dir(
         dbgym_workspace,
         JOB_TABLES_URL,
         "imdb.tgz",
-        get_default_tables_dname(DEFAULT_SCALE_FACTOR),
+        get_tables_dirname("job", DEFAULT_SCALE_FACTOR),
     )


@@ -199,51 +209,66 @@ def _download_and_untar_dir(
     an "original" directory name. If this is the case, you should set
     `untarred_original_dname` to ensure that it gets renamed to `untarred_dname`.
     """
-    expected_symlink_dpath = (
-        dbgym_workspace.cur_symlinks_data_path(mkdir=True) / f"{untarred_dname}.link"
+    expected_symlink_path = (
+        dbgym_workspace.dbgym_cur_symlinks_path / f"{untarred_dname}.link"
     )
-    if expected_symlink_dpath.exists():
+    if expected_symlink_path.exists():
         logging.getLogger(DBGYM_LOGGER_NAME).info(
-            f"Skipping download: {expected_symlink_dpath}"
+            f"Skipping download: {expected_symlink_path}"
         )
         return

-    logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloading: {expected_symlink_dpath}")
-    real_data_path = dbgym_workspace.cur_task_runs_data_path(mkdir=True)
-    subprocess_run(f"curl -O {download_url}", cwd=real_data_path)
-    untarred_data_dpath = dbgym_workspace.cur_task_runs_data_path(untarred_dname)
+    logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloading: {expected_symlink_path}")
+    subprocess_run(f"curl -O {download_url}", cwd=dbgym_workspace.dbgym_this_run_path)
+    untarred_data_path = dbgym_workspace.dbgym_this_run_path / untarred_dname

     if untarred_original_dname is not None:
-        assert not untarred_data_dpath.exists()
-        subprocess_run(f"tar -zxvf {download_tarred_fname}", cwd=real_data_path)
-        assert (real_data_path / untarred_original_dname).exists()
+        assert not untarred_data_path.exists()
+        subprocess_run(
+            f"tar -zxvf {download_tarred_fname}",
+            cwd=dbgym_workspace.dbgym_this_run_path,
+        )
+        assert (dbgym_workspace.dbgym_this_run_path / untarred_original_dname).exists()
         subprocess_run(
-            f"mv {untarred_original_dname} {untarred_dname}", cwd=real_data_path
+            f"mv {untarred_original_dname} {untarred_dname}",
+            cwd=dbgym_workspace.dbgym_this_run_path,
         )
     else:
-        untarred_data_dpath.mkdir(parents=True, exist_ok=False)
-        subprocess_run(f"tar -zxvf ../{download_tarred_fname}", cwd=untarred_data_dpath)
+        untarred_data_path.mkdir(parents=True, exist_ok=False)
+        subprocess_run(f"tar -zxvf ../{download_tarred_fname}", cwd=untarred_data_path)

-    assert untarred_data_dpath.exists()
-    subprocess_run(f"rm {download_tarred_fname}", cwd=real_data_path)
-    symlink_dpath = link_result(dbgym_workspace, untarred_data_dpath)
-    assert expected_symlink_dpath.samefile(symlink_dpath)
-    logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloaded: {expected_symlink_dpath}")
+    assert untarred_data_path.exists()
+    subprocess_run(
+        f"rm {download_tarred_fname}", cwd=dbgym_workspace.dbgym_this_run_path
+    )
+    symlink_path = dbgym_workspace.link_result(untarred_data_path)
+    assert expected_symlink_path.samefile(symlink_path)
+    logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloaded: {expected_symlink_path}")


 def _generate_job_workload(
     dbgym_workspace: DBGymWorkspace,
     query_subset: str,
 ) -> None:
-    workload_name = get_workload_name(DEFAULT_SCALE_FACTOR, query_subset)
-    expected_workload_symlink_dpath = dbgym_workspace.cur_symlinks_data_path(
-        mkdir=True
-    ) / (workload_name + ".link")
+    workload_name = get_workload_dirname(
+        "job",
+        DEFAULT_SCALE_FACTOR,
+        get_workload_suffix("job", query_subset=query_subset),
+    )
+    expected_workload_symlink_path = dbgym_workspace.dbgym_cur_symlinks_path / (
+        name_to_linkname(workload_name)
+    )
+    if expected_workload_symlink_path.exists():
+        logging.getLogger(DBGYM_LOGGER_NAME).info(
+            f"Skipping generation: {expected_workload_symlink_path}"
+        )
+        return

     logging.getLogger(DBGYM_LOGGER_NAME).info(
-        f"Generating: {expected_workload_symlink_dpath}"
+        f"Generating: {expected_workload_symlink_path}"
     )
-    real_dpath = dbgym_workspace.cur_task_runs_data_path(workload_name, mkdir=True)
+    workload_path = dbgym_workspace.dbgym_this_run_path / workload_name
+    workload_path.mkdir(parents=False, exist_ok=False)

     query_names = None
     if query_subset == "all":
@@ -255,19 +280,17 @@ def _generate_job_workload(
     else:
         assert False

-    with open(real_dpath / "order.txt", "w") as f:
+    with open(workload_path / "order.txt", "w") as f:
+        queries_parent_path = dbgym_workspace.dbgym_cur_symlinks_path / (
+            name_to_linkname(JOB_QUERIES_DNAME)
+        )
+
         for qname in query_names:
-            sql_fpath = (
-                dbgym_workspace.cur_symlinks_data_path(mkdir=True)
-                / (f"{JOB_QUERIES_DNAME}.link")
-            ).resolve() / f"{qname}.sql"
-            assert is_fully_resolved(
-                sql_fpath
-            ), "We should only write existent real absolute paths to a file"
-            f.write(f"Q{qname},{sql_fpath}\n")
+            sql_path = fully_resolve_path(queries_parent_path / f"{qname}.sql")
+            f.write(f"Q{qname},{sql_path}\n")

-    workload_symlink_dpath = link_result(dbgym_workspace, real_dpath)
-    assert workload_symlink_dpath == expected_workload_symlink_dpath
+    workload_symlink_path = dbgym_workspace.link_result(workload_path)
+    assert workload_symlink_path == expected_workload_symlink_path
     logging.getLogger(DBGYM_LOGGER_NAME).info(
-        f"Generated: {expected_workload_symlink_dpath}"
+        f"Generated: {expected_workload_symlink_path}"
     )
```
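The pattern running through both `_download_and_untar_dir` and `_generate_job_workload` above is: check for the expected `.link` symlink and skip if it exists; otherwise build into the current run's directory and publish via `link_result()`. A condensed sketch of that idempotency pattern (names taken from this diff; the build step is elided with a comment):

```python
# Condensed sketch of the skip-if-linked pattern used in this file.
from pathlib import Path

from util.workspace import DBGymWorkspace


def ensure_artifact(dbgym_workspace: DBGymWorkspace, name: str) -> Path:
    expected_symlink_path = dbgym_workspace.dbgym_cur_symlinks_path / f"{name}.link"
    if expected_symlink_path.exists():
        # Already built by an earlier run: reuse it.
        return expected_symlink_path

    # Build into this run's directory, then publish via a symlink.
    artifact_path = dbgym_workspace.dbgym_this_run_path / name
    artifact_path.mkdir(parents=False, exist_ok=False)
    # ... download or generate the artifact's contents here ...
    symlink_path = dbgym_workspace.link_result(artifact_path)
    assert expected_symlink_path.samefile(symlink_path)
    return expected_symlink_path
```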

benchmark/job/load_info.py (+20 −29)

```diff
@@ -1,16 +1,16 @@
 from pathlib import Path
 from typing import Optional

+from gymlib.symlinks_paths import get_tables_symlink_path
+
 from benchmark.constants import DEFAULT_SCALE_FACTOR
 from dbms.load_info_base_class import LoadInfoBaseClass
-from util.workspace import DBGymWorkspace, get_default_tables_dname, is_fully_resolved
+from util.workspace import DBGymWorkspace, fully_resolve_path

 JOB_SCHEMA_FNAME = "job_schema.sql"


 class JobLoadInfo(LoadInfoBaseClass):
-    CODEBASE_PATH_COMPONENTS = ["dbgym", "benchmark", "job"]
-    CODEBASE_DNAME = "_".join(CODEBASE_PATH_COMPONENTS)
     TABLES = [
         "aka_name",
         "aka_title",
@@ -36,43 +36,34 @@ class JobLoadInfo(LoadInfoBaseClass):
     ]

     def __init__(self, dbgym_workspace: DBGymWorkspace):
-        # schema and constraints
-        schema_root_dpath = dbgym_workspace.base_dbgym_repo_dpath
-        for component in JobLoadInfo.CODEBASE_PATH_COMPONENTS[
-            1:
-        ]:  # [1:] to skip "dbgym"
-            schema_root_dpath /= component
-        self._schema_fpath = schema_root_dpath / JOB_SCHEMA_FNAME
+        # Schema (directly in the codebase).
+        job_codebase_path = dbgym_workspace.base_dbgym_repo_path / "benchmark" / "job"
+        self._schema_path = job_codebase_path / JOB_SCHEMA_FNAME
         assert (
-            self._schema_fpath.exists()
-        ), f"self._schema_fpath ({self._schema_fpath}) does not exist"
+            self._schema_path.exists()
+        ), f"self._schema_path ({self._schema_path}) does not exist"

         # Tables
-        data_root_dpath = (
-            dbgym_workspace.dbgym_symlinks_path / JobLoadInfo.CODEBASE_DNAME / "data"
-        )
-        tables_symlink_dpath = (
-            data_root_dpath / f"{get_default_tables_dname(DEFAULT_SCALE_FACTOR)}.link"
+        tables_path = fully_resolve_path(
+            get_tables_symlink_path(
+                dbgym_workspace.dbgym_workspace_path, "job", DEFAULT_SCALE_FACTOR
+            )
         )
-        tables_dpath = tables_symlink_dpath.resolve()
-        assert is_fully_resolved(
-            tables_dpath
-        ), f"tables_dpath ({tables_dpath}) should be an existent real absolute path. Make sure you have generated the TPC-H data"
-        self._tables_and_fpaths = []
+        self._tables_and_paths = []
         for table in JobLoadInfo.TABLES:
-            table_fpath = tables_dpath / f"{table}.csv"
-            self._tables_and_fpaths.append((table, table_fpath))
+            table_path = tables_path / f"{table}.csv"
+            self._tables_and_paths.append((table, table_path))

-    def get_schema_fpath(self) -> Path:
-        return self._schema_fpath
+    def get_schema_path(self) -> Path:
+        return self._schema_path

-    def get_tables_and_fpaths(self) -> list[tuple[str, Path]]:
-        return self._tables_and_fpaths
+    def get_tables_and_paths(self) -> list[tuple[str, Path]]:
+        return self._tables_and_paths

     def get_table_file_delimiter(self) -> str:
         return ","

-    def get_constraints_fpath(self) -> Optional[Path]:
+    def get_constraints_path(self) -> Optional[Path]:
         # JOB does not have any constraints. It does have indexes, but we don't want to create
         # those indexes so that the tuning agent can start from a clean slate.
         return None
```
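For context, a hedged sketch of how a DBMS loader might consume this interface (the loop body is illustrative, not the repo's actual load code):

```python
# Illustrative consumer of JobLoadInfo's renamed accessors.
from benchmark.job.load_info import JobLoadInfo
from util.workspace import DBGymWorkspace


def load_job(dbgym_workspace: DBGymWorkspace) -> None:
    load_info = JobLoadInfo(dbgym_workspace)
    schema_sql = load_info.get_schema_path().read_text()  # CREATE TABLE statements
    for table, table_path in load_info.get_tables_and_paths():
        # e.g., for Postgres: COPY <table> FROM '<table_path>' (DELIMITER ',')
        delimiter = load_info.get_table_file_delimiter()
        print(f"would load {table} from {table_path} with delimiter {delimiter!r}")
```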

benchmark/tests/__init__.py (whitespace-only changes)

New file (+1):

```diff
@@ -0,0 +1 @@
+dbgym_workspace_path: ../dbgym_benchmark_integtest_workspace/
```
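This config points the integration tests at a dedicated workspace. A minimal sketch of resolving it (assumptions: the path is relative to the config file's directory, and the helper name here is hypothetical):

```python
# Hypothetical helper for reading the integration-test config; illustrative only.
from pathlib import Path

import yaml


def get_workspace_path_from_config(config_path: Path) -> Path:
    with open(config_path) as f:
        config = yaml.safe_load(f)
    # Assumption: the relative path is resolved against the config's directory.
    return (config_path.parent / config["dbgym_workspace_path"]).resolve()
```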
