diff --git a/.github/workflows/unittest_ci.yml b/.github/workflows/tests_ci.yml
similarity index 68%
rename from .github/workflows/unittest_ci.yml
rename to .github/workflows/tests_ci.yml
index 0c088c98..f2f151f9 100644
--- a/.github/workflows/unittest_ci.yml
+++ b/.github/workflows/tests_ci.yml
@@ -1,4 +1,4 @@
-name: Unit Tests
+name: Unit and Integration Tests
 
 on:
   push: {}
@@ -26,10 +26,22 @@ jobs:
       # Note that the GHA runners are stateful. Dependencies installed from previous runs will still be on the runner.
       # This means this step will usually be pretty fast as most dependencies will already be cached. However, it also
       # means that past runs might interfere with the current run, so you sometimes may need to restart the GHA runners.
+
+      # We need to run `. "$HOME/.cargo/env"` in each step that needs Cargo because each GHA step starts in a fresh shell.
       - name: Install dependencies
         run: |
           ./dependencies/install_dependencies.sh
-          . "$HOME/.cargo/env"
 
       - name: Run unit tests
-        run: python scripts/run_unittests.py
+        run: |
+          . "$HOME/.cargo/env"
+          python scripts/run_unit_tests.py
+
+      - name: Run integration test
+        # Delete the workspace. Run once with a clean workspace. Run again from the existing workspace.
+        # Need to run with a non-root user in order to start Postgres.
+        run: |
+          . "$HOME/.cargo/env"
+          rm -rf ../dbgym_integtest_workspace
+          ./scripts/integration_test.sh ssd
+          ./scripts/integration_test.sh ssd
diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py
index 82adeff5..ac9a11a4 100644
--- a/benchmark/tpch/cli.py
+++ b/benchmark/tpch/cli.py
@@ -67,7 +67,7 @@ def _clone(dbgym_cfg: DBGymConfig):
         f"./tpch_setup.sh {real_build_path}", cwd=dbgym_cfg.cur_source_path()
     )
     symlink_dpath = link_result(dbgym_cfg, real_build_path / "tpch-kit")
-    assert os.path.samefile(expected_symlink_dpath, symlink_dpath)
+    assert expected_symlink_dpath.samefile(symlink_dpath)
     benchmark_tpch_logger.info(f"Cloned: {expected_symlink_dpath}")
 
 
@@ -97,7 +97,7 @@ def _generate_queries(dbgym_cfg: DBGymConfig, seed_start: int, seed_end: int, sc
         verbose=False,
     )
     queries_symlink_dpath = link_result(dbgym_cfg, real_dir)
-    assert os.path.samefile(queries_symlink_dpath, expected_queries_symlink_dpath)
+    assert queries_symlink_dpath.samefile(expected_queries_symlink_dpath)
     benchmark_tpch_logger.info(
         f"Generated queries: {data_path} [{seed_start}, {seed_end}]"
     )
@@ -119,7 +119,7 @@ def _generate_data(dbgym_cfg: DBGymConfig, scale_factor: float):
     subprocess_run(f"mv ./*.tbl {real_dir}", cwd=tpch_kit_dpath / "dbgen")
 
     tables_symlink_dpath = link_result(dbgym_cfg, real_dir)
-    assert os.path.samefile(tables_symlink_dpath, expected_tables_symlink_dpath)
+    assert tables_symlink_dpath.samefile(expected_tables_symlink_dpath)
     benchmark_tpch_logger.info(f"Generated: {expected_tables_symlink_dpath}")
 
 
@@ -130,12 +130,12 @@ def _generate_workload(
     query_subset: str,
     scale_factor: float,
 ):
-    symlink_data_dir = dbgym_cfg.cur_symlinks_data_path(mkdir=True)
+    symlink_data_dpath = dbgym_cfg.cur_symlinks_data_path(mkdir=True)
     workload_name = workload_name_fn(scale_factor, seed_start, seed_end, query_subset)
-    expected_workload_symlink_dpath = symlink_data_dir / (workload_name + ".link")
+    expected_workload_symlink_dpath = symlink_data_dpath / (workload_name + ".link")
 
     benchmark_tpch_logger.info(f"Generating: {expected_workload_symlink_dpath}")
-    real_dir = dbgym_cfg.cur_task_runs_data_path(
+    real_dpath = dbgym_cfg.cur_task_runs_data_path(
         workload_name, mkdir=True
     )
@@ -147,15 +147,15 @@ def _generate_workload(
     elif query_subset == "odd":
         queries = [f"{i}" for i in range(1, 22 + 1) if i % 2 == 1]
 
-    with open(real_dir / "order.txt", "w") as f:
+    with open(real_dpath / "order.txt", "w") as f:
         for seed in range(seed_start, seed_end + 1):
             for qnum in queries:
-                sql_fpath = (symlink_data_dir / (_get_queries_dname(seed, scale_factor) + ".link")).resolve() / f"{qnum}.sql"
+                sql_fpath = (symlink_data_dpath / (_get_queries_dname(seed, scale_factor) + ".link")).resolve() / f"{qnum}.sql"
                 assert sql_fpath.exists() and not sql_fpath.is_symlink() and sql_fpath.is_absolute(), "We should only write existent real absolute paths to a file"
                 output = ",".join([f"S{seed}-Q{qnum}", str(sql_fpath)])
                 print(output, file=f)
 
     # TODO(WAN): add option to deep-copy the workload.
-    workload_symlink_dpath = link_result(dbgym_cfg, real_dir)
+    workload_symlink_dpath = link_result(dbgym_cfg, real_dpath)
     assert workload_symlink_dpath == expected_workload_symlink_dpath
     benchmark_tpch_logger.info(f"Generated: {expected_workload_symlink_dpath}")
diff --git a/benchmark/tpch/tpch_setup.sh b/benchmark/tpch/tpch_setup.sh
index 8f5c2478..011a7523 100755
--- a/benchmark/tpch/tpch_setup.sh
+++ b/benchmark/tpch/tpch_setup.sh
@@ -7,7 +7,7 @@ TPCH_REPO_ROOT="$1"
 if [ ! -d "${TPCH_REPO_ROOT}/tpch-kit" ]; then
   mkdir -p "${TPCH_REPO_ROOT}"
   cd "${TPCH_REPO_ROOT}"
-  git clone git@github.com:lmwnshn/tpch-kit.git --single-branch --branch master --depth 1
+  git clone https://github.com/lmwnshn/tpch-kit.git --single-branch --branch master --depth 1
   cd ./tpch-kit/dbgen
   make MACHINE=LINUX DATABASE=POSTGRESQL
 fi
diff --git a/config.yaml b/dbgym_config.yaml
similarity index 100%
rename from config.yaml
rename to dbgym_config.yaml
diff --git a/dbms/postgres/build_repo.sh b/dbms/postgres/build_repo.sh
index 271f7056..2127c438 100755
--- a/dbms/postgres/build_repo.sh
+++ b/dbms/postgres/build_repo.sh
@@ -7,7 +7,7 @@ REPO_REAL_PARENT_DPATH="$1"
 # Download and make postgres from the boot repository.
 mkdir -p "${REPO_REAL_PARENT_DPATH}"
 cd "${REPO_REAL_PARENT_DPATH}"
-git clone git@github.com:lmwnshn/boot.git --single-branch --branch vldb_2024 --depth 1
+git clone https://github.com/lmwnshn/boot.git --single-branch --branch vldb_2024 --depth 1
 cd ./boot
 ./cmudb/build/configure.sh release "${REPO_REAL_PARENT_DPATH}/boot/build/postgres"
 make clean
@@ -25,7 +25,7 @@ make install -j
 cd "${REPO_REAL_PARENT_DPATH}/boot"
 
 # Download and make hypopg.
-git clone git@github.com:HypoPG/hypopg.git
+git clone https://github.com/HypoPG/hypopg.git
 cd ./hypopg
 PG_CONFIG="${REPO_REAL_PARENT_DPATH}/boot/build/postgres/bin/pg_config" make install
 cd "${REPO_REAL_PARENT_DPATH}/boot"
diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py
index f81a877f..22789140 100644
--- a/dbms/postgres/cli.py
+++ b/dbms/postgres/cli.py
@@ -104,7 +104,7 @@ def _build_repo(dbgym_cfg: DBGymConfig, rebuild):
 
     # only link at the end so that the link only ever points to a complete repo
     repo_symlink_dpath = link_result(dbgym_cfg, repo_real_dpath)
-    assert os.path.samefile(expected_repo_symlink_dpath, repo_symlink_dpath)
+    assert expected_repo_symlink_dpath.samefile(repo_symlink_dpath)
     dbms_postgres_logger.info(f"Set up repo in {expected_repo_symlink_dpath}")
 
 
diff --git a/manage/cli.py b/manage/cli.py
index d8c7e9b4..3ce8c65a 100644
--- a/manage/cli.py
+++ b/manage/cli.py
@@ -169,11 +169,11 @@ def clean_workspace(dbgym_cfg: DBGymConfig, mode: str="safe", verbose=False) ->
             if not is_child_path(real_fordpath, dbgym_cfg.dbgym_runs_path):
                 continue
 
-            assert not os.path.samefile(real_fordpath, dbgym_cfg.dbgym_runs_path)
+            assert not real_fordpath.samefile(dbgym_cfg.dbgym_runs_path)
 
             # Figure out the task_run_child_fordpath to put into task_run_child_fordpaths_to_keep
             task_run_child_fordpath = None
-            if os.path.samefile(parent_dpath_of_path(real_fordpath), dbgym_cfg.dbgym_runs_path):
+            if parent_dpath_of_path(real_fordpath).samefile(dbgym_cfg.dbgym_runs_path):
                 # While it's true that it shouldn't be possible to symlink to a directory directly in task_runs/,
                 # we'll just not delete it if the user happens to have one like this. Even if the user messed up
                 # the structure somehow, it's just a good idea not to delete it.
@@ -183,10 +183,10 @@ def clean_workspace(dbgym_cfg: DBGymConfig, mode: str="safe", verbose=False) ->
                 # However, as with above, we won't just nuke files if the workspace doesn't follow this rule for
                 # some reason.
                 task_run_child_fordpath = real_fordpath
-                while not os.path.samefile(parent_dpath_of_path(task_run_child_fordpath), dbgym_cfg.dbgym_runs_path):
+                while not parent_dpath_of_path(task_run_child_fordpath).samefile(dbgym_cfg.dbgym_runs_path):
                     task_run_child_fordpath = parent_dpath_of_path(task_run_child_fordpath)
             assert task_run_child_fordpath != None
-            assert os.path.samefile(parent_dpath_of_path(task_run_child_fordpath), dbgym_cfg.dbgym_runs_path), f"task_run_child_fordpath ({task_run_child_fordpath}) is not a direct child of dbgym_cfg.dbgym_runs_path"
+            assert parent_dpath_of_path(task_run_child_fordpath).samefile(dbgym_cfg.dbgym_runs_path), f"task_run_child_fordpath ({task_run_child_fordpath}) is not a direct child of dbgym_cfg.dbgym_runs_path"
             task_run_child_fordpaths_to_keep.add(task_run_child_fordpath)
 
             # If on safe mode, add symlinks inside the task_run_child_fordpath to be processed
diff --git a/misc/utils.py b/misc/utils.py
index 380cafca..24562494 100644
--- a/misc/utils.py
+++ b/misc/utils.py
@@ -168,19 +168,19 @@ class DBGymConfig:
     """
     Global configurations that apply to all parts of DB-Gym
     """
+    num_times_created_this_run: int = 0
+
+    def __init__(self, dbgym_config_path: Path):
+        # The logic around dbgym_tmp_path assumes that DBGymConfig is only constructed once.
+        DBGymConfig.num_times_created_this_run += 1
+        assert DBGymConfig.num_times_created_this_run == 1, f"DBGymConfig has been created {DBGymConfig.num_times_created_this_run} times. It should only be created once per run."
-    def __init__(self, config_path):
-        """
-        Parameters
-        ----------
-        config_path : Path
-        """
         assert is_base_git_dir(
             os.getcwd()
         ), "This script should be invoked from the root of the dbgym repo."
 
         # Parse the YAML file.
-        contents: str = Path(config_path).read_text()
+        contents: str = dbgym_config_path.read_text()
         yaml_config: dict = yaml.safe_load(contents)
 
         # Require dbgym_workspace_path to be absolute.
@@ -189,7 +189,7 @@ def __init__(self, config_path):
             Path(yaml_config["dbgym_workspace_path"]).resolve().absolute()
         )
 
-        self.path: Path = config_path
+        self.path: Path = dbgym_config_path
         self.cur_path_list: list[str] = ["dbgym"]
         self.root_yaml: dict = yaml_config
         self.cur_yaml: dict = self.root_yaml
@@ -204,11 +204,14 @@ def __init__(self, config_path):
             self.dbgym_workspace_path
         )
         self.dbgym_symlinks_path.mkdir(parents=True, exist_ok=True)
-        # tmp is a workspace for this run only
-        # one use for it is to place the unzipped dbdata
-        # there's no need to save the actual dbdata dir in run_*/ because we just save a symlink to
-        # the .tgz file we unzipped
+        # tmp/ is a workspace for this run only
+        # One use for it is to place the unzipped dbdata.
+        # There's no need to save the actual dbdata dir in run_*/ because we just save a symlink to
+        # the .tgz file we unzipped.
         self.dbgym_tmp_path = get_tmp_path_from_workspace_path(self.dbgym_workspace_path)
+        # The best place to delete the old dbgym_tmp_path is in DBGymConfig.__init__().
+        # This is better than deleting the dbgym_tmp_path in DBGymConfig.__del__() because DBGymConfig may get deleted before execution has completed.
+        # Also, by keeping the tmp directory around, you can look at it to debug issues.
         if self.dbgym_tmp_path.exists():
             shutil.rmtree(self.dbgym_tmp_path)
         self.dbgym_tmp_path.mkdir(parents=True, exist_ok=True)
@@ -217,14 +220,10 @@ def __init__(self, config_path):
         self.dbgym_this_run_path = (
             self.dbgym_runs_path / f"run_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
         )
-        # exist_ok is False because we don't want to override a previous task run's data.
+        # `exist_ok` is False because we don't want to overwrite a previous task run's data.
         self.dbgym_this_run_path.mkdir(parents=True, exist_ok=False)
 
-    def __del__(self):
-        if self.dbgym_tmp_path.exists():
-            shutil.rmtree(self.dbgym_tmp_path)
-
-    # append_group() is used to mark the "codebase path" of an invocation of the CLI. The "codebase path" is
+    # `append_group()` is used to mark the "codebase path" of an invocation of the CLI. The "codebase path" is
     # explained further in the documentation.
     def append_group(self, name) -> None:
         self.cur_path_list.append(name)
@@ -285,15 +284,15 @@ def conv_inputpath_to_realabspath(dbgym_cfg: DBGymConfig, inputpath: os.PathLike
     It *does not* check whether the path exists, since the user might be wanting to create a new file/dir
     Raises RuntimeError for errors
     """
-    # for simplicity we only process Path objects
+    # For simplicity, we only process Path objects.
     realabspath = Path(inputpath)
-    # expanduser() is always "ok" to call first
+    # `expanduser()` is always "ok" to call first.
     realabspath = realabspath.expanduser()
-    # the reason we don't call Path.absolute() is because the path should be relative to dbgym_cfg.dbgym_repo_path,
-    # which is not necessary where cwd() points at the time of calling this function
+    # The reason we don't call Path.absolute() is because the path should be relative to dbgym_cfg.dbgym_repo_path,
+    # which is not necessarily where cwd() points at the time of calling this function.
     if not realabspath.is_absolute():
         realabspath = dbgym_cfg.dbgym_repo_path / realabspath
-    # resolve has two uses: normalize the path (remove ..) and resolve symlinks
+    # `resolve()` has two uses: normalize the path (remove ..) and resolve symlinks.
     # I believe the pathlib library (https://docs.python.org/3/library/pathlib.html#pathlib.Path.resolve) does it this
     # way to avoid an edge case related to symlinks and normalizing paths (footnote 1 of the linked docs)
     realabspath = realabspath.resolve()
@@ -320,7 +319,7 @@ def is_fully_resolved(path: Path) -> bool:
     assert isinstance(path, Path)
     resolved_path = path.resolve()
     # Converting them to strings is the most unambiguously strict way of checking equality.
-    # Stuff like Path.__eq__() or os.path.samefile() might be more lenient.
+    # Stuff like Path.__eq__() or Path.samefile() might be more lenient.
     return str(resolved_path) == str(path)
 
 
@@ -362,6 +361,7 @@ def basename_of_path(dpath: Path) -> str:
     return dpath_basename
 
 
+# TODO(phw2): refactor to use Path
 def is_child_path(child_path: os.PathLike, parent_dpath: os.PathLike) -> bool:
     """
     Checks whether child_path refers to a file/dir/link that is a child of the dir referred to by parent_dpath
@@ -394,7 +394,8 @@ def open_and_save(dbgym_cfg: DBGymConfig, open_fpath: Path, mode="r"):
     - If you open two "config" files of the same name but different paths, only the first open will be saved.
     - Opening two "dependency" files of the same name but different paths will lead to two different "base dirs" being symlinked.
     """
-    # process/validate open_fpath
+    # validate open_fpath
+    assert isinstance(open_fpath, Path)
     assert is_fully_resolved(
         open_fpath
     ), f"open_and_save(): open_fpath ({open_fpath}) should be a fully resolved path"
@@ -416,21 +417,23 @@ def extract_from_task_run_fordpath(dbgym_cfg: DBGymConfig, task_run_fordpath: Pa
     The task_runs/ folder is organized like task_runs/run_*/[codebase]/[org]/any/path/you/want.
     This function extracts the [codebase] and [org] components
     """
+    assert isinstance(task_run_fordpath, Path)
     assert not task_run_fordpath.is_symlink()
-    parent_dpath = os.path.dirname(task_run_fordpath)
-    assert not os.path.samefile(
-        parent_dpath, dbgym_cfg.dbgym_runs_path
+    parent_dpath = task_run_fordpath.parent
+    # TODO(phw2): make this a common function
+    assert not parent_dpath.samefile(
+        dbgym_cfg.dbgym_runs_path
     ), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/ dir instead of directly in dbgym_cfg.dbgym_runs_path ({dbgym_cfg.dbgym_runs_path})"
-    assert not os.path.samefile(
-        parent_dpath_of_path(parent_dpath), dbgym_cfg.dbgym_runs_path
+    assert not parent_dpath_of_path(parent_dpath).samefile(
+        dbgym_cfg.dbgym_runs_path
    ), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/[codebase]/ dir instead of directly in run_*/ ({dbgym_cfg.dbgym_runs_path})"
-    assert not os.path.samefile(
-        parent_dpath_of_path(parent_dpath_of_path(parent_dpath)), dbgym_cfg.dbgym_runs_path
+    assert not parent_dpath_of_path(parent_dpath_of_path(parent_dpath)).samefile(
+        dbgym_cfg.dbgym_runs_path
     ), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/[codebase]/[organization]/ dir instead of directly in run_*/ ({dbgym_cfg.dbgym_runs_path})"
     # org_dpath is the run_*/[codebase]/[organization]/ dir that task_run_fordpath is in
     org_dpath = parent_dpath
-    while not os.path.samefile(
-        parent_dpath_of_path(parent_dpath_of_path(parent_dpath_of_path(org_dpath))), dbgym_cfg.dbgym_runs_path
+    while not parent_dpath_of_path(parent_dpath_of_path(parent_dpath_of_path(org_dpath))).samefile(
+        dbgym_cfg.dbgym_runs_path
     ):
         org_dpath = parent_dpath_of_path(org_dpath)
     org_dname = basename_of_path(org_dpath)
@@ -440,6 +443,7 @@ def extract_from_task_run_fordpath(dbgym_cfg: DBGymConfig, task_run_fordpath: Pa
     return codebase_dpath, codebase_dname, org_dpath, org_dname
 
 
+# TODO(phw2): really look at the clean PR to see what it changed
 # TODO(phw2): after merging agent-train, refactor some code in agent-train to use save_file() instead of open_and_save()
 def save_file(dbgym_cfg: DBGymConfig, fpath: Path) -> Path:
     """
@@ -451,12 +455,8 @@ def save_file(dbgym_cfg: DBGymConfig, fpath: Path) -> Path:
     We create a symlink if it is a "dependency", meaning a task.py command was run to generate it
     In these cases we create a symlink so we have full provenance for how the dependency was created
     """
-    # process fpath and ensure that it's a file at the end
-    fpath = conv_inputpath_to_realabspath(dbgym_cfg, fpath)
-    fpath = os.path.realpath(fpath)  # traverse symlinks
-    assert is_fully_resolved(
-        fpath
-    ), f"fpath ({fpath}) should be a fully resolved path"
+    # validate fpath
+    assert isinstance(fpath, Path)
     assert not os.path.islink(fpath), f"fpath ({fpath}) should not be a symlink"
     assert os.path.exists(fpath), f"fpath ({fpath}) does not exist"
     assert os.path.isfile(fpath), f"fpath ({fpath}) is not a file"
@@ -470,34 +470,14 @@ def save_file(dbgym_cfg: DBGymConfig, fpath: Path) -> Path:
     # 2. files or dirs generated by a run may be very large (up to 100s of GBs) so we don't want to copy them
     if is_child_path(fpath, dbgym_cfg.dbgym_runs_path):
         # get paths we'll need later.
-        parent_dpath = os.path.dirname(fpath)
-        assert not os.path.samefile(
-            parent_dpath, dbgym_cfg.dbgym_runs_path
-        ), f"fpath ({fpath}) should be inside a run_*/ dir instead of directly in dbgym_cfg.dbgym_runs_path ({dbgym_cfg.dbgym_runs_path})"
-        assert not os.path.samefile(
-            parent_dpath_of_path(parent_dpath), dbgym_cfg.dbgym_runs_path
-        ), f"fpath ({fpath}) should be inside a run_*/[codebase]/ dir instead of directly in run_*/ ({dbgym_cfg.dbgym_runs_path})"
-        assert not os.path.samefile(
-            parent_dpath_of_path(parent_dpath_of_path(parent_dpath)), dbgym_cfg.dbgym_runs_path
-        ), f"fpath ({fpath}) should be inside a run_*/[codebase]/[organization]/ dir instead of directly in run_*/ ({dbgym_cfg.dbgym_runs_path})"
-        # org_dpath is the run_*/[codebase]/[organization]/ dir that fpath is in
-        org_dpath = parent_dpath
-        while not os.path.samefile(
-            parent_dpath_of_path(parent_dpath_of_path(parent_dpath_of_path(org_dpath))), dbgym_cfg.dbgym_runs_path
-        ):
-            org_dpath = parent_dpath_of_path(org_dpath)
-        org_dname = basename_of_path(org_dpath)
-        codebase_dpath = parent_dpath_of_path(org_dpath)
-        codebase_dname = basename_of_path(codebase_dpath)
-        this_run_save_dpath = os.path.join(
-            dbgym_cfg.dbgym_this_run_path, codebase_dname, org_dname
-        )
+        _, codebase_dname, org_dpath, org_dname = extract_from_task_run_fordpath(dbgym_cfg, fpath)
+        this_run_save_dpath = dbgym_cfg.dbgym_this_run_path / codebase_dname / org_dname
         os.makedirs(this_run_save_dpath, exist_ok=True)
 
         # if the fpath file is directly in org_dpath, we symlink the file directly
-        parent_dpath = os.path.dirname(fpath)
-        if os.path.samefile(parent_dpath, org_dpath):
-            fname = os.path.basename(fpath)
+        parent_dpath = parent_dpath_of_path(fpath)
+        if parent_dpath.samefile(org_dpath):
+            fname = basename_of_path(fpath)
             symlink_fpath = this_run_save_dpath / (fname + ".link")
             try_create_symlink(fpath, symlink_fpath)
         # else, we know the fpath file is _not_ directly inside org_dpath dir
@@ -506,19 +486,19 @@ def save_file(dbgym_cfg: DBGymConfig, fpath: Path) -> Path:
         else:
             # set base_dpath such that its parent is org_dpath
             base_dpath = parent_dpath
-            while not os.path.samefile(parent_dpath_of_path(base_dpath), org_dpath):
+            while not parent_dpath_of_path(base_dpath).samefile(org_dpath):
                 base_dpath = parent_dpath_of_path(base_dpath)
 
             # create symlink
             open_base_dname = basename_of_path(base_dpath)
-            symlink_dpath = os.path.join(this_run_save_dpath, open_base_dname)
+            symlink_dpath = this_run_save_dpath / (open_base_dname + ".link")
             try_create_symlink(base_dpath, symlink_dpath)
     # if it wasn't generated by a run
     else:
         # since we don't know where the file is at all, the location is "unknown" and the org is "all"
         this_run_save_dpath = dbgym_cfg.dbgym_this_run_path / "unknown" / "all"
         os.makedirs(this_run_save_dpath, exist_ok=True)
-        fname = os.path.basename(fpath)
+        fname = basename_of_path(fpath)
         # in this case, we want to copy instead of symlinking since it might disappear in the future
         copy_fpath = this_run_save_dpath / fname
         shutil.copy(fpath, copy_fpath)
@@ -536,25 +516,26 @@ def link_result(dbgym_cfg: DBGymConfig, result_fordpath: Path, custom_result_nam
     version of a file. This function will return the path to the symlink that was created.
""" - assert is_fully_resolved(result_path), f"result_path ({result_path}) should be a fully resolved path" - result_path = conv_inputpath_to_realabspath(dbgym_cfg, result_path) - assert is_child_path(result_path, dbgym_cfg.dbgym_this_run_path) - assert not os.path.islink(result_path) + assert isinstance(result_fordpath, Path) + assert is_fully_resolved(result_fordpath), f"result_fordpath ({result_fordpath}) should be a fully resolved path" + result_fordpath = conv_inputpath_to_realabspath(dbgym_cfg, result_fordpath) + assert is_child_path(result_fordpath, dbgym_cfg.dbgym_this_run_path) + assert not os.path.islink(result_fordpath) if custom_result_name != None: result_name = custom_result_name else: - if os.path.isfile(result_path): - result_name = os.path.basename(result_path) - elif os.path.isdir(result_path): - result_name = basename_of_path(result_path) + if os.path.isfile(result_fordpath): + result_name = basename_of_path(result_fordpath) + ".link" + elif os.path.isdir(result_fordpath): + result_name = basename_of_path(result_fordpath) + ".link" else: raise AssertionError("result_fordpath must be either a file or dir") # Figure out the parent directory path of the symlink codebase_dpath, codebase_dname, _, org_dname = extract_from_task_run_fordpath(dbgym_cfg, result_fordpath) # We're only supposed to save files generated by us, which means they should be in cur_task_runs_path() - assert os.path.samefile(codebase_dpath, dbgym_cfg.cur_task_runs_path()), f"link_result should only be called on files generated by this invocation to task.py" + assert codebase_dpath.samefile(dbgym_cfg.cur_task_runs_path()), f"link_result should only be called on files generated by this invocation to task.py" symlink_parent_dpath = dbgym_cfg.dbgym_symlinks_path / codebase_dname / org_dname symlink_parent_dpath.mkdir(parents=True, exist_ok=True) @@ -562,7 +543,7 @@ def link_result(dbgym_cfg: DBGymConfig, result_fordpath: Path, custom_result_nam # Note that in a multi-threaded setting, this might remove one created by a process in the same run, # meaning it's not "old" by our definition of "old". However, we'll always end up with a symlink # file of the current run regardless of the order of threads. - assert result_name.endswith(".link") and not result_name.endswith(".link.link"), "result_name ({result_name}) should end with \".link\"" + assert result_name.endswith(".link") and not result_name.endswith(".link.link"), f"result_name ({result_name}) should end with \".link\"" symlink_path = symlink_parent_dpath / result_name try_remove_file(symlink_path) try_create_symlink(result_fordpath, symlink_path) @@ -594,7 +575,7 @@ def try_remove_file(path: Path) -> None: pass -def restart_ray(redis_port: int): +def restart_ray(redis_port: int) -> None: """ Stop and start Ray. This is good to do between each stage to avoid bugs from carrying over across stages @@ -607,7 +588,7 @@ def restart_ray(redis_port: int): ) -def make_redis_started(port: int): +def make_redis_started(port: int) -> None: """ Start Redis if it's not already started. Note that Ray uses Redis but does *not* use this function. It starts Redis on its own. 
diff --git a/scripts/integration_test.sh b/scripts/integration_test.sh
new file mode 100755
index 00000000..c63af9fa
--- /dev/null
+++ b/scripts/integration_test.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+set -euxo pipefail
+
+DBMS=postgres
+BENCHMARK=tpch
+SCALE_FACTOR=0.01
+AGENT=protox
+INTENDED_DBDATA_HARDWARE="${1:-hdd}"
+
+export DBGYM_CONFIG_PATH=scripts/integtest_dbgym_config.yaml
+
+# Benchmark
+python3 task.py benchmark $BENCHMARK data $SCALE_FACTOR
+python3 task.py benchmark $BENCHMARK workload --scale-factor $SCALE_FACTOR
+
+# DBMS
+python3 task.py dbms $DBMS build
+python3 task.py dbms $DBMS dbdata tpch --scale-factor $SCALE_FACTOR --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE
+
+# Tune
+python3 task.py tune $AGENT embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE # long datagen so that train doesn't crash
+python3 task.py tune $AGENT embedding train tpch --scale-factor $SCALE_FACTOR --iterations-per-epoch 1 --num-points-to-sample 1 --num-batches 1 --batch-size 64 --start-epoch 15 --num-samples 4 --train-max-concurrent 4 --num-curate 2
+python3 task.py tune $AGENT agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 2 --max-concurrent 2 --workload-timeout 15 --query-timeout 1 --tune-duration-during-hpo 0.01 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE
+python3 task.py tune $AGENT agent tune tpch --scale-factor $SCALE_FACTOR
+python3 task.py tune $AGENT agent replay tpch --scale-factor $SCALE_FACTOR
diff --git a/scripts/integtest_dbgym_config.yaml b/scripts/integtest_dbgym_config.yaml
new file mode 100644
index 00000000..14a1063a
--- /dev/null
+++ b/scripts/integtest_dbgym_config.yaml
@@ -0,0 +1,3 @@
+dbgym_workspace_path: ../dbgym_integtest_workspace
+boot_redis_port: 7379
+ray_gcs_port: 7380
\ No newline at end of file
diff --git a/scripts/quickstart.sh b/scripts/quickstart.sh
index 7d082726..70965793 100755
--- a/scripts/quickstart.sh
+++ b/scripts/quickstart.sh
@@ -18,6 +18,5 @@ python3 task.py dbms $DBMS dbdata tpch --scale-factor $SCALE_FACTOR
 # Tune
 python3 task.py tune $AGENT embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" # long datagen so that train doesn't crash
 python3 task.py tune $AGENT embedding train tpch --scale-factor $SCALE_FACTOR --iterations-per-epoch 1 --num-points-to-sample 1 --num-batches 1 --batch-size 64 --start-epoch 15 --num-samples 4 --train-max-concurrent 4 --num-curate 2
-python3 task.py tune $AGENT agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 2 --max-concurrent 2 --workload-timeout 15 --query-timeout 1 --tune-duration-during-hpo 0.01 --build-space-good-for-boot
+python3 task.py tune $AGENT agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 2 --max-concurrent 2 --workload-timeout 15 --query-timeout 1 --tune-duration-during-hpo 0.01
 python3 task.py tune $AGENT agent tune tpch --scale-factor $SCALE_FACTOR
-python3 task.py tune $AGENT agent replay tpch --scale-factor $SCALE_FACTOR
diff --git a/scripts/run_unittests.py b/scripts/run_unit_tests.py
similarity index 100%
rename from scripts/run_unittests.py
rename to scripts/run_unit_tests.py
diff --git a/task.py b/task.py
index 31a5bd12..93b59b1e 100644
--- a/task.py
+++ b/task.py
@@ -1,4 +1,6 @@
 import logging
+import os
+from pathlib import Path
 
 import click
 from misc.utils import DBGymConfig
@@ -8,16 +10,17 @@ from tune.cli import tune_group
 from manage.cli import manage_group
 
 
+# TODO(phw2): save commit, git diff, and run command
 # TODO(phw2): remove write permissions on old run_*/ dirs to enforce that they are immutable
 @click.group()
-@click.option("--config-path", default="config.yaml")
 @click.pass_context
-def task(ctx, config_path):
+def task(ctx):
     """💩💩💩 CMU-DB Database Gym: github.com/cmu-db/dbgym 💩💩💩"""
-    ctx.obj = DBGymConfig(config_path)
+    dbgym_config_path = Path(os.getenv("DBGYM_CONFIG_PATH", "dbgym_config.yaml"))
+    ctx.obj = DBGymConfig(dbgym_config_path)
 
 
 if __name__ == "__main__":
diff --git a/tune/protox/embedding/train_all.py b/tune/protox/embedding/train_all.py
index 20d73292..f5bbd687 100644
--- a/tune/protox/embedding/train_all.py
+++ b/tune/protox/embedding/train_all.py
@@ -187,7 +187,7 @@ def train_all_embeddings(
     # Connect to cluster or die.
     restart_ray(dbgym_cfg.root_yaml["ray_gcs_port"])
 
-    ray.init(address=f"localhost:{dbgym_cfg.root_yaml['boot_redis_port']}", log_to_driver=False)
+    ray.init(address=f"localhost:{dbgym_cfg.root_yaml['ray_gcs_port']}", log_to_driver=False)
     scheduler = FIFOScheduler()  # type: ignore
 
     # Search.
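
With the --config-path flag removed from task.py, the active config file is now selected purely through the environment: integration_test.sh points DBGYM_CONFIG_PATH at scripts/integtest_dbgym_config.yaml, and everything else falls back to dbgym_config.yaml. A minimal sketch of the selection logic follows (a standalone restatement of the task.py change; the helper name is ours for illustration):

    import os
    from pathlib import Path

    def get_dbgym_config_path() -> Path:
        # Mirrors the new task.py behavior: DBGYM_CONFIG_PATH overrides the
        # default dbgym_config.yaml at the repo root.
        return Path(os.getenv("DBGYM_CONFIG_PATH", "dbgym_config.yaml"))

    # Usage, as in integration_test.sh:
    #   export DBGYM_CONFIG_PATH=scripts/integtest_dbgym_config.yaml
    #   python3 task.py benchmark tpch data 0.01

Because the env var scopes the workspace (and the Redis/Ray ports in the YAML) per invocation, the integration test can run against ../dbgym_integtest_workspace without colliding with a developer's normal workspace.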