Workspace refactor #66

Merged · 43 commits · Dec 29, 2024

Commits
37b423a  DBGymConfig -> DBGymWorkspace (wangpatrick57, Dec 26, 2024)
58937bf  fmt (wangpatrick57, Dec 26, 2024)
b6d580c  renamed IntegtestWorkspace (wangpatrick57, Dec 26, 2024)
2a9b081  shortened GymlibIntegtestWorkspaceManager (wangpatrick57, Dec 26, 2024)
a98ca86  moved benchmark and scale factor into class (wangpatrick57, Dec 26, 2024)
61080fb  now taking in benchmark and scale factor as envvars (wangpatrick57, Dec 26, 2024)
9a89bb1  DBGymWorkspace now only takes in the workspace path in init (wangpatrick57, Dec 26, 2024)
7090628  wrote get_base_dbgym_repo_dpath function (wangpatrick57, Dec 26, 2024)
ab73156  fully_resolve_path no longer takes in dbgym_workspace (wangpatrick57, Dec 26, 2024)
3add664  get_workspace_path_from_config no longer calls fully_resolve_path (wangpatrick57, Dec 26, 2024)
6ca5cb2  comments (wangpatrick57, Dec 26, 2024)
910c218  fmt (wangpatrick57, Dec 26, 2024)
723a72a  moved env tests to env/tests/ (wangpatrick57, Dec 26, 2024)
2a5ead1  DBGYM_CONFIG_PATH is now only set in one place (wangpatrick57, Dec 26, 2024)
71495df  del unused (wangpatrick57, Dec 26, 2024)
9845d22  unittest_workspace basics (wangpatrick57, Dec 27, 2024)
d4c3206  refactored clean unittests to have a workspace in the scratch space (… (wangpatrick57, Dec 27, 2024)
3d93e56  test_workspace_init (wangpatrick57, Dec 27, 2024)
dc1bfba  get_workspace_init_structure (wangpatrick57, Dec 27, 2024)
c0772aa  added more workspace init tests (wangpatrick57, Dec 27, 2024)
d3514f8  renamed integtest_util to gymlib_integtest_util (wangpatrick57, Dec 27, 2024)
bf6e8f7  moved structure util to its own file (to avoid running cleantests twi… (wangpatrick57, Dec 27, 2024)
0e77451  gitignore (wangpatrick57, Dec 27, 2024)
316ca02  get_updated_structure_from_workspace_init (wangpatrick57, Dec 27, 2024)
88f31b3  test_link_result_basic_functionality (wangpatrick57, Dec 27, 2024)
3f95632  refactored workspace tests to use helpers (wangpatrick57, Dec 27, 2024)
e65b0d2  test_link_result_invalid_custom_link_name and test_link_result_valid_… (wangpatrick57, Dec 27, 2024)
c3e17f4  fmt and mypy (wangpatrick57, Dec 27, 2024)
fc2f424  test_link_same_result_with_different_name (wangpatrick57, Dec 27, 2024)
905a65c  got rid of link_result_helper (wangpatrick57, Dec 27, 2024)
1484715  test_link_result_does_not_copy_directory_structure_to_symlinks_dir (wangpatrick57, Dec 27, 2024)
d9603eb  fmt and mypy (wangpatrick57, Dec 27, 2024)
4dde8e6  test_link_result_from_another_run_raises_error (wangpatrick57, Dec 28, 2024)
7381e07  test_link_result_from_external_dir_raises_error and make_file_helper (wangpatrick57, Dec 28, 2024)
f1e56fc  test_link_result_cannot_link_symlink (wangpatrick57, Dec 28, 2024)
8e5e5f2  link result tests done (wangpatrick57, Dec 28, 2024)
1a0d947  now passing test_save_file_dependency (wangpatrick57, Dec 28, 2024)
ddefc2b  test_save_file_config (wangpatrick57, Dec 28, 2024)
f558a42  test_save_file_generated_this_run_raises_error (wangpatrick57, Dec 28, 2024)
f0e660f  wrote save same file twice tests (wangpatrick57, Dec 28, 2024)
7d9ee75  can now write file contents when creating file (wangpatrick57, Dec 28, 2024)
8fa8ac0  fmt and mypy (wangpatrick57, Dec 28, 2024)
36d1c68  done with two file save tests and code (wangpatrick57, Dec 29, 2024)
Files changed
2 changes: 1 addition & 1 deletion .github/workflows/tests.yaml
@@ -52,7 +52,7 @@ jobs:
       # Integration tests do require external systems to be running (most commonly a database instance).
       # Unlike end-to-end tests though, they test a specific module in a detailed manner, much like a unit test does.
       env:
-        # We set `INTENDED_DBDATA_HARDWARE` so that it's seen when `integtest_pg_conn.py` executes `./env/set_up_env_integtests.sh`.
+        # We set `INTENDED_DBDATA_HARDWARE` so that it's seen when `integtest_pg_conn.py` executes `_set_up_gymlib_integtest_workspace.sh`.
         INTENDED_DBDATA_HARDWARE: ssd
       run: |
         . "$HOME/.cargo/env"
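The only change in this file is the comment: the workflow exports the variable so the renamed setup script can see it. A minimal sketch of the consumer side, assuming the setup code validates the value in Python (the variable name is real; the validation shown here is hypothetical):

    import os

    # Hypothetical consumer-side check for the workflow-exported variable.
    # The real validation lives in the integration-test setup scripts.
    intended_hardware = os.environ.get("INTENDED_DBDATA_HARDWARE", "hdd")
    assert intended_hardware in ("hdd", "ssd"), f"unexpected value: {intended_hardware}"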
5 changes: 3 additions & 2 deletions .gitignore
@@ -3,7 +3,8 @@ __pycache__/
 .conda/
 .idea/
 build/
-test_clean_scratchspace/
+*_scratchspace/
+workspace/
 default_*_benchbase_config_*.xml
-*.egg-info/
 *.egg-info/
+*.code-workspace
6 changes: 3 additions & 3 deletions benchmark/cli.py
@@ -2,13 +2,13 @@
 
 from benchmark.job.cli import job_group
 from benchmark.tpch.cli import tpch_group
-from util.workspace import DBGymConfig
+from util.workspace import DBGymWorkspace
 
 
 @click.group(name="benchmark")
 @click.pass_obj
-def benchmark_group(dbgym_cfg: DBGymConfig) -> None:
-    dbgym_cfg.append_group("benchmark")
+def benchmark_group(dbgym_workspace: DBGymWorkspace) -> None:
+    dbgym_workspace.append_group("benchmark")
 
 
 benchmark_group.add_command(tpch_group)
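The rename here is to the object that Click threads through every command group. A minimal sketch of that context-object pattern, assuming a top-level `task` group that constructs the workspace; the root-group wiring is not part of this diff, so its exact form below is an assumption:

    from pathlib import Path

    import click


    class DBGymWorkspace:
        # Stand-in for the real class; per commit 9a89bb1 its __init__ now
        # takes only the workspace path.
        def __init__(self, dbgym_workspace_path: Path) -> None:
            self.dbgym_workspace_path = dbgym_workspace_path
            self.cur_path_list = ["dbgym"]

        def append_group(self, name: str) -> None:
            # The real method also manages per-run directories; elided here.
            self.cur_path_list.append(name)


    @click.group()
    @click.pass_context
    def task(ctx: click.Context) -> None:
        # Build the workspace once at the CLI root and stash it as the Click
        # context object; subgroups then receive it via @click.pass_obj.
        ctx.obj = DBGymWorkspace(Path("../dbgym_workspace"))


    @click.group(name="benchmark")
    @click.pass_obj
    def benchmark_group(dbgym_workspace: DBGymWorkspace) -> None:
        dbgym_workspace.append_group("benchmark")


    task.add_command(benchmark_group)

    if __name__ == "__main__":
        task()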
48 changes: 24 additions & 24 deletions benchmark/job/cli.py
@@ -7,7 +7,7 @@
 from util.log import DBGYM_LOGGER_NAME
 from util.shell import subprocess_run
 from util.workspace import (
-    DBGymConfig,
+    DBGymWorkspace,
     get_default_tables_dname,
     get_workload_name,
     is_fully_resolved,
@@ -136,8 +136,8 @@
 
 @click.group(name="job")
 @click.pass_obj
-def job_group(dbgym_cfg: DBGymConfig) -> None:
-    dbgym_cfg.append_group("job")
+def job_group(dbgym_workspace: DBGymWorkspace) -> None:
+    dbgym_workspace.append_group("job")
 
 
 @job_group.command(name="data")
@@ -146,9 +146,9 @@ def job_group(dbgym_cfg: DBGymConfig) -> None:
 @click.pass_obj
 # The reason generate data is separate from create dbdata is because generate-data is generic
 # to all DBMSs while create dbdata is specific to a single DBMS.
-def job_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None:
+def job_data(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None:
     assert scale_factor == DEFAULT_SCALE_FACTOR
-    _download_job_data(dbgym_cfg)
+    _download_job_data(dbgym_workspace)
 
 
 @job_group.command(name="workload")
@@ -160,25 +160,25 @@ def job_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None:
 @click.option("--scale-factor", type=float, default=DEFAULT_SCALE_FACTOR)
 @click.pass_obj
 def job_workload(
-    dbgym_cfg: DBGymConfig, query_subset: str, scale_factor: float
+    dbgym_workspace: DBGymWorkspace, query_subset: str, scale_factor: float
 ) -> None:
     assert scale_factor == DEFAULT_SCALE_FACTOR
-    _download_job_queries(dbgym_cfg)
-    _generate_job_workload(dbgym_cfg, query_subset)
+    _download_job_queries(dbgym_workspace)
+    _generate_job_workload(dbgym_workspace, query_subset)
 
 
-def _download_job_data(dbgym_cfg: DBGymConfig) -> None:
+def _download_job_data(dbgym_workspace: DBGymWorkspace) -> None:
     _download_and_untar_dir(
-        dbgym_cfg,
+        dbgym_workspace,
         JOB_TABLES_URL,
         "imdb.tgz",
         get_default_tables_dname(DEFAULT_SCALE_FACTOR),
     )
 
 
-def _download_job_queries(dbgym_cfg: DBGymConfig) -> None:
+def _download_job_queries(dbgym_workspace: DBGymWorkspace) -> None:
     _download_and_untar_dir(
-        dbgym_cfg,
+        dbgym_workspace,
         JOB_QUERIES_URL,
         "job.tgz",
         JOB_QUERIES_DNAME,
@@ -187,7 +187,7 @@ def _download_job_queries(dbgym_cfg: DBGymConfig) -> None:
 
 
 def _download_and_untar_dir(
-    dbgym_cfg: DBGymConfig,
+    dbgym_workspace: DBGymWorkspace,
     download_url: str,
     download_tarred_fname: str,
     untarred_dname: str,
@@ -200,7 +200,7 @@ def _download_and_untar_dir(
     `untarred_original_dname` to ensure that it gets renamed to `untarred_dname`.
     """
     expected_symlink_dpath = (
-        dbgym_cfg.cur_symlinks_data_path(mkdir=True) / f"{untarred_dname}.link"
+        dbgym_workspace.cur_symlinks_data_path(mkdir=True) / f"{untarred_dname}.link"
     )
     if expected_symlink_dpath.exists():
         logging.getLogger(DBGYM_LOGGER_NAME).info(
@@ -209,9 +209,9 @@ def _download_and_untar_dir(
         return
 
     logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloading: {expected_symlink_dpath}")
-    real_data_path = dbgym_cfg.cur_task_runs_data_path(mkdir=True)
+    real_data_path = dbgym_workspace.cur_task_runs_data_path(mkdir=True)
     subprocess_run(f"curl -O {download_url}", cwd=real_data_path)
-    untarred_data_dpath = dbgym_cfg.cur_task_runs_data_path(untarred_dname)
+    untarred_data_dpath = dbgym_workspace.cur_task_runs_data_path(untarred_dname)
 
     if untarred_original_dname is not None:
         assert not untarred_data_dpath.exists()
@@ -226,24 +226,24 @@ def _download_and_untar_dir(
 
     assert untarred_data_dpath.exists()
     subprocess_run(f"rm {download_tarred_fname}", cwd=real_data_path)
-    symlink_dpath = link_result(dbgym_cfg, untarred_data_dpath)
+    symlink_dpath = link_result(dbgym_workspace, untarred_data_dpath)
     assert expected_symlink_dpath.samefile(symlink_dpath)
     logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloaded: {expected_symlink_dpath}")
 
 
 def _generate_job_workload(
-    dbgym_cfg: DBGymConfig,
+    dbgym_workspace: DBGymWorkspace,
     query_subset: str,
 ) -> None:
     workload_name = get_workload_name(DEFAULT_SCALE_FACTOR, query_subset)
-    expected_workload_symlink_dpath = dbgym_cfg.cur_symlinks_data_path(mkdir=True) / (
-        workload_name + ".link"
-    )
+    expected_workload_symlink_dpath = dbgym_workspace.cur_symlinks_data_path(
+        mkdir=True
+    ) / (workload_name + ".link")
 
     logging.getLogger(DBGYM_LOGGER_NAME).info(
        f"Generating: {expected_workload_symlink_dpath}"
     )
-    real_dpath = dbgym_cfg.cur_task_runs_data_path(workload_name, mkdir=True)
+    real_dpath = dbgym_workspace.cur_task_runs_data_path(workload_name, mkdir=True)
 
     query_names = None
     if query_subset == "all":
@@ -258,15 +258,15 @@ def _generate_job_workload(
     with open(real_dpath / "order.txt", "w") as f:
         for qname in query_names:
             sql_fpath = (
-                dbgym_cfg.cur_symlinks_data_path(mkdir=True)
+                dbgym_workspace.cur_symlinks_data_path(mkdir=True)
                 / (f"{JOB_QUERIES_DNAME}.link")
             ).resolve() / f"{qname}.sql"
             assert is_fully_resolved(
                 sql_fpath
             ), "We should only write existent real absolute paths to a file"
             f.write(f"Q{qname},{sql_fpath}\n")
 
-    workload_symlink_dpath = link_result(dbgym_cfg, real_dpath)
+    workload_symlink_dpath = link_result(dbgym_workspace, real_dpath)
     assert workload_symlink_dpath == expected_workload_symlink_dpath
     logging.getLogger(DBGYM_LOGGER_NAME).info(
         f"Generated: {expected_workload_symlink_dpath}"
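The core of this file is the `_download_and_untar_dir` pattern: produce the artifact under the current run directory, then publish it behind a `.link` symlink so the next invocation can skip the download entirely. A condensed sketch of that pattern, using plain pathlib/subprocess in place of the repo's `subprocess_run` and `link_result` helpers (so the helper behavior shown is an approximation):

    import logging
    import subprocess
    from pathlib import Path


    def download_and_link(
        symlinks_dir: Path, run_dir: Path, url: str, tarball: str, out_dname: str
    ) -> Path:
        # Idempotency check: if the published symlink exists, skip the download.
        expected_link = symlinks_dir / f"{out_dname}.link"
        if expected_link.exists():
            logging.info("already downloaded: %s", expected_link)
            return expected_link

        # Produce the real artifact inside this run's directory.
        subprocess.run(["curl", "-O", url], cwd=run_dir, check=True)
        subprocess.run(["tar", "-xzf", tarball], cwd=run_dir, check=True)
        (run_dir / tarball).unlink()  # drop the tarball once extracted

        # Publish it; the repo's link_result also validates the target location.
        real_dir = run_dir / out_dname
        assert real_dir.exists()
        expected_link.symlink_to(real_dir.resolve())
        return expected_link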
8 changes: 4 additions & 4 deletions benchmark/job/load_info.py
@@ -3,7 +3,7 @@
 
 from benchmark.constants import DEFAULT_SCALE_FACTOR
 from dbms.load_info_base_class import LoadInfoBaseClass
-from util.workspace import DBGymConfig, get_default_tables_dname, is_fully_resolved
+from util.workspace import DBGymWorkspace, get_default_tables_dname, is_fully_resolved
 
 JOB_SCHEMA_FNAME = "job_schema.sql"
 
@@ -35,9 +35,9 @@ class JobLoadInfo(LoadInfoBaseClass):
         "title",
     ]
 
-    def __init__(self, dbgym_cfg: DBGymConfig):
+    def __init__(self, dbgym_workspace: DBGymWorkspace):
         # schema and constraints
-        schema_root_dpath = dbgym_cfg.dbgym_repo_path
+        schema_root_dpath = dbgym_workspace.base_dbgym_repo_dpath
         for component in JobLoadInfo.CODEBASE_PATH_COMPONENTS[
             1:
         ]:  # [1:] to skip "dbgym"
@@ -49,7 +49,7 @@ def __init__(self, dbgym_cfg: DBGymConfig):
 
         # Tables
         data_root_dpath = (
-            dbgym_cfg.dbgym_symlinks_path / JobLoadInfo.CODEBASE_DNAME / "data"
+            dbgym_workspace.dbgym_symlinks_path / JobLoadInfo.CODEBASE_DNAME / "data"
         )
         tables_symlink_dpath = (
             data_root_dpath / f"{get_default_tables_dname(DEFAULT_SCALE_FACTOR)}.link"
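Both hunks lean on the `.link` convention: a symlink in the workspace's symlinks tree points at the real artifact, and `is_fully_resolved` guards against consuming paths that still contain symlinks. A sketch of the invariant as it appears to be used here; the function body is an assumption, only the call sites above come from the PR:

    import tempfile
    from pathlib import Path


    def is_fully_resolved(path: Path) -> bool:
        # Assumed invariant: absolute, existent, and free of symlink
        # components, i.e. resolving the path changes nothing.
        return path.is_absolute() and path.exists() and path == path.resolve()


    # Usage mirroring JobLoadInfo.__init__: follow the .link indirection once,
    # then require that the destination is a real absolute path.
    with tempfile.TemporaryDirectory() as tmp:
        real_dir = Path(tmp) / "tables_sf1"  # hypothetical artifact name
        real_dir.mkdir()
        link = Path(tmp) / "tables_sf1.link"
        link.symlink_to(real_dir)

        tables_dpath = link.resolve()
        assert is_fully_resolved(tables_dpath)
        assert not is_fully_resolved(link)  # the symlink itself never qualifies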
66 changes: 35 additions & 31 deletions benchmark/tpch/cli.py
@@ -8,7 +8,7 @@
 from util.log import DBGYM_LOGGER_NAME
 from util.shell import subprocess_run
 from util.workspace import (
-    DBGymConfig,
+    DBGymWorkspace,
     get_default_tables_dname,
     get_scale_factor_string,
     get_workload_name,
@@ -19,18 +19,18 @@
 
 @click.group(name="tpch")
 @click.pass_obj
-def tpch_group(dbgym_cfg: DBGymConfig) -> None:
-    dbgym_cfg.append_group("tpch")
+def tpch_group(dbgym_workspace: DBGymWorkspace) -> None:
+    dbgym_workspace.append_group("tpch")
 
 
 @tpch_group.command(name="data")
 @click.argument("scale-factor", type=float)
 @click.pass_obj
 # The reason generate data is separate from create dbdata is because generate-data is generic
 # to all DBMSs while create dbdata is specific to a single DBMS.
-def tpch_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None:
-    _clone_tpch_kit(dbgym_cfg)
-    _generate_data(dbgym_cfg, scale_factor)
+def tpch_data(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None:
+    _clone_tpch_kit(dbgym_workspace)
+    _generate_data(dbgym_workspace, scale_factor)
 
 
 @tpch_group.command(name="workload")
@@ -54,7 +54,7 @@ def tpch_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None:
 @click.option("--scale-factor", type=float, default=DEFAULT_SCALE_FACTOR)
 @click.pass_obj
 def tpch_workload(
-    dbgym_cfg: DBGymConfig,
+    dbgym_workspace: DBGymWorkspace,
     seed_start: int,
     seed_end: int,
     query_subset: str,
@@ -63,18 +63,20 @@ def tpch_workload(
     assert (
         seed_start <= seed_end
     ), f"seed_start ({seed_start}) must be <= seed_end ({seed_end})"
-    _clone_tpch_kit(dbgym_cfg)
-    _generate_tpch_queries(dbgym_cfg, seed_start, seed_end, scale_factor)
-    _generate_tpch_workload(dbgym_cfg, seed_start, seed_end, query_subset, scale_factor)
+    _clone_tpch_kit(dbgym_workspace)
+    _generate_tpch_queries(dbgym_workspace, seed_start, seed_end, scale_factor)
+    _generate_tpch_workload(
+        dbgym_workspace, seed_start, seed_end, query_subset, scale_factor
+    )
 
 
 def _get_queries_dname(seed: int, scale_factor: float) -> str:
     return f"queries_{seed}_sf{get_scale_factor_string(scale_factor)}"
 
 
-def _clone_tpch_kit(dbgym_cfg: DBGymConfig) -> None:
+def _clone_tpch_kit(dbgym_workspace: DBGymWorkspace) -> None:
     expected_symlink_dpath = (
-        dbgym_cfg.cur_symlinks_build_path(mkdir=True) / "tpch-kit.link"
+        dbgym_workspace.cur_symlinks_build_path(mkdir=True) / "tpch-kit.link"
     )
     if expected_symlink_dpath.exists():
         logging.getLogger(DBGYM_LOGGER_NAME).info(
@@ -83,26 +85,28 @@ def _clone_tpch_kit(dbgym_cfg: DBGymConfig) -> None:
         return
 
     logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloning: {expected_symlink_dpath}")
-    real_build_path = dbgym_cfg.cur_task_runs_build_path()
+    real_build_path = dbgym_workspace.cur_task_runs_build_path()
     subprocess_run(
-        f"./clone_tpch_kit.sh {real_build_path}", cwd=dbgym_cfg.cur_source_path()
+        f"./clone_tpch_kit.sh {real_build_path}", cwd=dbgym_workspace.cur_source_path()
     )
-    symlink_dpath = link_result(dbgym_cfg, real_build_path / "tpch-kit")
+    symlink_dpath = link_result(dbgym_workspace, real_build_path / "tpch-kit")
     assert expected_symlink_dpath.samefile(symlink_dpath)
     logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloned: {expected_symlink_dpath}")
 
 
-def _get_tpch_kit_dpath(dbgym_cfg: DBGymConfig) -> Path:
-    tpch_kit_dpath = (dbgym_cfg.cur_symlinks_build_path() / "tpch-kit.link").resolve()
+def _get_tpch_kit_dpath(dbgym_workspace: DBGymWorkspace) -> Path:
+    tpch_kit_dpath = (
+        dbgym_workspace.cur_symlinks_build_path() / "tpch-kit.link"
+    ).resolve()
     assert is_fully_resolved(tpch_kit_dpath)
     return tpch_kit_dpath
 
 
 def _generate_tpch_queries(
-    dbgym_cfg: DBGymConfig, seed_start: int, seed_end: int, scale_factor: float
+    dbgym_workspace: DBGymWorkspace, seed_start: int, seed_end: int, scale_factor: float
 ) -> None:
-    tpch_kit_dpath = _get_tpch_kit_dpath(dbgym_cfg)
-    data_path = dbgym_cfg.cur_symlinks_data_path(mkdir=True)
+    tpch_kit_dpath = _get_tpch_kit_dpath(dbgym_workspace)
+    data_path = dbgym_workspace.cur_symlinks_data_path(mkdir=True)
     logging.getLogger(DBGYM_LOGGER_NAME).info(
         f"Generating queries: {data_path} [{seed_start}, {seed_end}]"
     )
@@ -113,7 +117,7 @@ def _generate_tpch_queries(
         if expected_queries_symlink_dpath.exists():
             continue
 
-        real_dir = dbgym_cfg.cur_task_runs_data_path(
+        real_dir = dbgym_workspace.cur_task_runs_data_path(
             _get_queries_dname(seed, scale_factor), mkdir=True
         )
         for i in range(1, NUM_TPCH_QUERIES + 1):
@@ -123,16 +127,16 @@ def _generate_tpch_queries(
                 cwd=tpch_kit_dpath / "dbgen",
                 verbose=False,
             )
-        queries_symlink_dpath = link_result(dbgym_cfg, real_dir)
+        queries_symlink_dpath = link_result(dbgym_workspace, real_dir)
         assert queries_symlink_dpath.samefile(expected_queries_symlink_dpath)
     logging.getLogger(DBGYM_LOGGER_NAME).info(
         f"Generated queries: {data_path} [{seed_start}, {seed_end}]"
     )
 
 
-def _generate_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None:
-    tpch_kit_dpath = _get_tpch_kit_dpath(dbgym_cfg)
-    data_path = dbgym_cfg.cur_symlinks_data_path(mkdir=True)
+def _generate_data(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None:
+    tpch_kit_dpath = _get_tpch_kit_dpath(dbgym_workspace)
+    data_path = dbgym_workspace.cur_symlinks_data_path(mkdir=True)
     expected_tables_symlink_dpath = (
         data_path / f"{get_default_tables_dname(scale_factor)}.link"
     )
@@ -146,26 +150,26 @@ def _generate_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None:
         f"Generating: {expected_tables_symlink_dpath}"
     )
     subprocess_run(f"./dbgen -vf -s {scale_factor}", cwd=tpch_kit_dpath / "dbgen")
-    real_dir = dbgym_cfg.cur_task_runs_data_path(
+    real_dir = dbgym_workspace.cur_task_runs_data_path(
         get_default_tables_dname(scale_factor), mkdir=True
     )
     subprocess_run(f"mv ./*.tbl {real_dir}", cwd=tpch_kit_dpath / "dbgen")
 
-    tables_symlink_dpath = link_result(dbgym_cfg, real_dir)
+    tables_symlink_dpath = link_result(dbgym_workspace, real_dir)
     assert tables_symlink_dpath.samefile(expected_tables_symlink_dpath)
     logging.getLogger(DBGYM_LOGGER_NAME).info(
         f"Generated: {expected_tables_symlink_dpath}"
     )
 
 
 def _generate_tpch_workload(
-    dbgym_cfg: DBGymConfig,
+    dbgym_workspace: DBGymWorkspace,
     seed_start: int,
     seed_end: int,
     query_subset: str,
     scale_factor: float,
 ) -> None:
-    symlink_data_dpath = dbgym_cfg.cur_symlinks_data_path(mkdir=True)
+    symlink_data_dpath = dbgym_workspace.cur_symlinks_data_path(mkdir=True)
     workload_name = get_workload_name(
         scale_factor, f"{seed_start}_{seed_end}_{query_subset}"
     )
@@ -174,7 +178,7 @@ def _generate_tpch_workload(
     logging.getLogger(DBGYM_LOGGER_NAME).info(
         f"Generating: {expected_workload_symlink_dpath}"
     )
-    real_dpath = dbgym_cfg.cur_task_runs_data_path(workload_name, mkdir=True)
+    real_dpath = dbgym_workspace.cur_task_runs_data_path(workload_name, mkdir=True)
 
     query_names = None
     if query_subset == "all":
@@ -199,7 +203,7 @@ def _generate_tpch_workload(
             f.write(f"S{seed}-Q{qname},{sql_fpath}\n")
     # TODO(WAN): add option to deep-copy the workload.
 
-    workload_symlink_dpath = link_result(dbgym_cfg, real_dpath)
+    workload_symlink_dpath = link_result(dbgym_workspace, real_dpath)
     assert workload_symlink_dpath == expected_workload_symlink_dpath
     logging.getLogger(DBGYM_LOGGER_NAME).info(
         f"Generated: {expected_workload_symlink_dpath}"
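The loop body elided between these hunks is where queries actually get generated: one `./qgen` invocation per query template, per seed. A rough sketch of that per-seed loop, assuming tpch-kit's standard qgen flags (`-r` for the RNG seed, `-s` for the scale factor); the exact command line is not visible in this diff:

    import subprocess
    from pathlib import Path

    NUM_TPCH_QUERIES = 22  # TPC-H defines query templates Q1 through Q22


    def generate_queries_for_seed(
        tpch_kit_dpath: Path, out_dir: Path, seed: int, scale_factor: float
    ) -> None:
        out_dir.mkdir(parents=True, exist_ok=True)
        for i in range(1, NUM_TPCH_QUERIES + 1):
            # qgen writes the substituted query text to stdout; capture it and
            # save one .sql file per template, named by query number.
            proc = subprocess.run(
                f"./qgen -r {seed} -s {scale_factor} {i}",
                shell=True,
                cwd=tpch_kit_dpath / "dbgen",
                check=True,
                capture_output=True,
                text=True,
            )
            (out_dir / f"{i}.sql").write_text(proc.stdout)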