Skip to content

Commit

Permalink
Integration test CI (#37)
Browse files Browse the repository at this point in the history
**Summary**: Added a CI for running integration tests. Fixed a few bugs
to get it working.

**Demo**:
[Successful
CI](https://github.com/cmu-db/dbgym/actions/runs/10242135761/job/28331697565)
![Screenshot 2024-08-04 at 21 08
29](https://github.com/user-attachments/assets/f70cbc54-4cf7-473a-a37c-4546e60b63ee)

**Details**:
* Made the runner use the `runner` user instead of `root` so we can
start Postgres in it.
* Made a new DBGym config for the integration test so that the ports and
workspace don't conflict with a normal run (yay config file!). This is not
currently necessary since we run it in a Docker container, but it's a good
precaution.
* In the integration test, we run the full chain (with replay) twice so
that we test it both without and with cached results in the workspace.
We make sure to delete the workspace before the first run (again, not
strictly necessary because we're using Docker, but it is a good precaution).
  • Loading branch information
wangpatrick57 authored Aug 5, 2024
1 parent e373554 commit 9ef1ee6
Show file tree
Hide file tree
Showing 14 changed files with 130 additions and 106 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Unit Tests
name: Unit and Integration Tests

on:
push: {}
Expand Down Expand Up @@ -26,10 +26,22 @@ jobs:
# Note that the GHA runners are stateful. Dependencies installed from previous runs will still be on the runner.
# This means this step will usually be pretty fast as most dependencies will already be cached. However, it also
# means that past runs might interfere with the current run, so you sometimes may need to restart the GHA runners.

# We need to do `. "$HOME/.cargo/env"` in each step for it to work.
- name: Install dependencies
run: |
./dependencies/install_dependencies.sh
. "$HOME/.cargo/env"
- name: Run unit tests
run: python scripts/run_unittests.py
run: |
. "$HOME/.cargo/env"
python scripts/run_unit_tests.py
- name: Run integration test
# Delete the workspace. Run once with a clean workspace. Run again from the existing workspace.
# Need to run with a non-root user in order to start Postgres.
run: |
. "$HOME/.cargo/env"
rm -rf ../dbgym_integtest_workspace
./scripts/integration_test.sh ssd
./scripts/integration_test.sh ssd
18 changes: 9 additions & 9 deletions benchmark/tpch/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def _clone(dbgym_cfg: DBGymConfig):
f"./tpch_setup.sh {real_build_path}", cwd=dbgym_cfg.cur_source_path()
)
symlink_dpath = link_result(dbgym_cfg, real_build_path / "tpch-kit")
assert os.path.samefile(expected_symlink_dpath, symlink_dpath)
assert expected_symlink_dpath.samefile(symlink_dpath)
benchmark_tpch_logger.info(f"Cloned: {expected_symlink_dpath}")


Expand Down Expand Up @@ -97,7 +97,7 @@ def _generate_queries(dbgym_cfg: DBGymConfig, seed_start: int, seed_end: int, sc
verbose=False,
)
queries_symlink_dpath = link_result(dbgym_cfg, real_dir)
assert os.path.samefile(queries_symlink_dpath, expected_queries_symlink_dpath)
assert queries_symlink_dpath.samefile(expected_queries_symlink_dpath)
benchmark_tpch_logger.info(
f"Generated queries: {data_path} [{seed_start}, {seed_end}]"
)
Expand All @@ -119,7 +119,7 @@ def _generate_data(dbgym_cfg: DBGymConfig, scale_factor: float):
subprocess_run(f"mv ./*.tbl {real_dir}", cwd=tpch_kit_dpath / "dbgen")

tables_symlink_dpath = link_result(dbgym_cfg, real_dir)
assert os.path.samefile(tables_symlink_dpath, expected_tables_symlink_dpath)
assert tables_symlink_dpath.samefile(expected_tables_symlink_dpath)
benchmark_tpch_logger.info(f"Generated: {expected_tables_symlink_dpath}")


Expand All @@ -130,12 +130,12 @@ def _generate_workload(
query_subset: str,
scale_factor: float,
):
symlink_data_dir = dbgym_cfg.cur_symlinks_data_path(mkdir=True)
symlink_data_dpath = dbgym_cfg.cur_symlinks_data_path(mkdir=True)
workload_name = workload_name_fn(scale_factor, seed_start, seed_end, query_subset)
expected_workload_symlink_dpath = symlink_data_dir / (workload_name + ".link")
expected_workload_symlink_dpath = symlink_data_dpath / (workload_name + ".link")

benchmark_tpch_logger.info(f"Generating: {expected_workload_symlink_dpath}")
real_dir = dbgym_cfg.cur_task_runs_data_path(
real_dpath = dbgym_cfg.cur_task_runs_data_path(
workload_name, mkdir=True
)

Expand All @@ -147,15 +147,15 @@ def _generate_workload(
elif query_subset == "odd":
queries = [f"{i}" for i in range(1, 22 + 1) if i % 2 == 1]

with open(real_dir / "order.txt", "w") as f:
with open(real_dpath / "order.txt", "w") as f:
for seed in range(seed_start, seed_end + 1):
for qnum in queries:
sql_fpath = (symlink_data_dir / (_get_queries_dname(seed, scale_factor) + ".link")).resolve() / f"{qnum}.sql"
sql_fpath = (symlink_data_dpath / (_get_queries_dname(seed, scale_factor) + ".link")).resolve() / f"{qnum}.sql"
assert sql_fpath.exists() and not sql_fpath.is_symlink() and sql_fpath.is_absolute(), "We should only write existent real absolute paths to a file"
output = ",".join([f"S{seed}-Q{qnum}", str(sql_fpath)])
print(output, file=f)
# TODO(WAN): add option to deep-copy the workload.

workload_symlink_dpath = link_result(dbgym_cfg, real_dir)
workload_symlink_dpath = link_result(dbgym_cfg, real_dpath)
assert workload_symlink_dpath == expected_workload_symlink_dpath
benchmark_tpch_logger.info(f"Generated: {expected_workload_symlink_dpath}")
2 changes: 1 addition & 1 deletion benchmark/tpch/tpch_setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ TPCH_REPO_ROOT="$1"
if [ ! -d "${TPCH_REPO_ROOT}/tpch-kit" ]; then
mkdir -p "${TPCH_REPO_ROOT}"
cd "${TPCH_REPO_ROOT}"
git clone git@github.com:lmwnshn/tpch-kit.git --single-branch --branch master --depth 1
git clone https://github.com/lmwnshn/tpch-kit.git --single-branch --branch master --depth 1
cd ./tpch-kit/dbgen
make MACHINE=LINUX DATABASE=POSTGRESQL
fi
File renamed without changes.
4 changes: 2 additions & 2 deletions dbms/postgres/build_repo.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ REPO_REAL_PARENT_DPATH="$1"
# Download and make postgres from the boot repository.
mkdir -p "${REPO_REAL_PARENT_DPATH}"
cd "${REPO_REAL_PARENT_DPATH}"
git clone git@github.com:lmwnshn/boot.git --single-branch --branch vldb_2024 --depth 1
git clone https://github.com/lmwnshn/boot.git --single-branch --branch vldb_2024 --depth 1
cd ./boot
./cmudb/build/configure.sh release "${REPO_REAL_PARENT_DPATH}/boot/build/postgres"
make clean
Expand All @@ -25,7 +25,7 @@ make install -j
cd "${REPO_REAL_PARENT_DPATH}/boot"

# Download and make hypopg.
git clone git@github.com:HypoPG/hypopg.git
git clone https://github.com/HypoPG/hypopg.git
cd ./hypopg
PG_CONFIG="${REPO_REAL_PARENT_DPATH}/boot/build/postgres/bin/pg_config" make install
cd "${REPO_REAL_PARENT_DPATH}/boot"
Expand Down
2 changes: 1 addition & 1 deletion dbms/postgres/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def _build_repo(dbgym_cfg: DBGymConfig, rebuild):

# only link at the end so that the link only ever points to a complete repo
repo_symlink_dpath = link_result(dbgym_cfg, repo_real_dpath)
assert os.path.samefile(expected_repo_symlink_dpath, repo_symlink_dpath)
assert expected_repo_symlink_dpath.samefile(repo_symlink_dpath)
dbms_postgres_logger.info(f"Set up repo in {expected_repo_symlink_dpath}")


Expand Down
8 changes: 4 additions & 4 deletions manage/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,11 +169,11 @@ def clean_workspace(dbgym_cfg: DBGymConfig, mode: str="safe", verbose=False) ->
if not is_child_path(real_fordpath, dbgym_cfg.dbgym_runs_path):
continue

assert not os.path.samefile(real_fordpath, dbgym_cfg.dbgym_runs_path)
assert not real_fordpath.samefile(dbgym_cfg.dbgym_runs_path)

# Figure out the task_run_child_fordpath to put into task_run_child_fordpaths_to_keep
task_run_child_fordpath = None
if os.path.samefile(parent_dpath_of_path(real_fordpath), dbgym_cfg.dbgym_runs_path):
if parent_dpath_of_path(real_fordpath).samefile(dbgym_cfg.dbgym_runs_path):
# While it's true that it shouldn't be possible to symlink to a directory directly in task_runs/,
# we'll just not delete it if the user happens to have one like this. Even if the user messed up
# the structure somehow, it's just a good idea not to delete it.
Expand All @@ -183,10 +183,10 @@ def clean_workspace(dbgym_cfg: DBGymConfig, mode: str="safe", verbose=False) ->
# However, as with above, we won't just nuke files if the workspace doesn't follow this rule for
# some reason.
task_run_child_fordpath = real_fordpath
while not os.path.samefile(parent_dpath_of_path(task_run_child_fordpath), dbgym_cfg.dbgym_runs_path):
while not parent_dpath_of_path(task_run_child_fordpath).samefile(dbgym_cfg.dbgym_runs_path):
task_run_child_fordpath = parent_dpath_of_path(task_run_child_fordpath)
assert task_run_child_fordpath != None
assert os.path.samefile(parent_dpath_of_path(task_run_child_fordpath), dbgym_cfg.dbgym_runs_path), f"task_run_child_fordpath ({task_run_child_fordpath}) is not a direct child of dbgym_cfg.dbgym_runs_path"
assert parent_dpath_of_path(task_run_child_fordpath).samefile(dbgym_cfg.dbgym_runs_path), f"task_run_child_fordpath ({task_run_child_fordpath}) is not a direct child of dbgym_cfg.dbgym_runs_path"
task_run_child_fordpaths_to_keep.add(task_run_child_fordpath)

# If on safe mode, add symlinks inside the task_run_child_fordpath to be processed
Expand Down
Loading

0 comments on commit 9ef1ee6

Please sign in to comment.