-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Automated workspace pruning/cleaning (#35)
**Summary**: A command to find and remove unneeded files from the workspace directory. **Demo**: Passing 25 unit tests with many edge cases which create files/directories, call `clean_workspace()`, and then verify the workspace's contents. ![Screenshot 2024-07-07 at 17 05 07](https://github.com/cmu-db/dbgym/assets/20631215/ed2edeae-062a-40ae-b38e-e2ad3718d6b3) **Details** * "Aggressive" mode removes all task_runs/\*/ directories that are not directly pointed to by a symlink in symlinks/. * "Safe" mode also keeps task_runs/\*/ directories which are indirectly pointed to by a symlink. This can happen if a symlink points to a task_runs/\*/ directory which has a symlink in it that points to another task_runs/\*/ directory. * I chose to write so many unit tests because this operation must be bug-free.
- Loading branch information
1 parent
2f17bd4
commit 3245aab
Showing
251 changed files
with
1,030 additions
and
130 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
__pycache__/ | ||
.conda/ | ||
.idea/ | ||
test_clean_scratchspace/ | ||
|
||
workspace/ | ||
default_*_benchbase_config_*.xml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
#!/bin/bash
# Download the rustup installer script from sh.rustup.rs and pipe it into sh.
# The transfer is restricted to HTTPS with TLS 1.2 or newer; curl runs quietly,
# still reports errors, and fails on HTTP error responses.
curl --proto '=https' --tlsv1.2 --silent --show-error --fail https://sh.rustup.rs | sh
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,217 @@ | ||
import shutil | ||
from typing import List, Set | ||
import click | ||
import yaml | ||
import logging | ||
from pathlib import Path | ||
from misc.utils import DBGymConfig, is_child_path, parent_dpath_of_path | ||
from itertools import chain | ||
import os | ||
|
||
|
||
# Logger used by the commands in this module for their "Read:" / "Updated:" / cleanup
# messages. Its level is INFO so that those records are not filtered out by this logger.
task_logger = logging.getLogger("task")
task_logger.setLevel(logging.INFO)
|
||
|
||
@click.group(name="manage")
def manage_group():
    # Root of the "manage" command group; it does no work of its own.
    # Documented with a comment rather than a docstring so that click's generated
    # help text for the group stays exactly as it was.
    return None
|
||
|
||
@click.command(name="show")
@click.argument("keys", nargs=-1)
@click.pass_obj
def manage_show(dbgym_cfg, keys):
    # Print the config value reached by following `keys` from the root of
    # dbgym_cfg.yaml, then log which config file was read.
    # (Comment instead of a docstring so click's generated help text is unchanged.)
    config_path = dbgym_cfg.path
    node = dbgym_cfg.yaml

    # Descend one level per key; with no keys the whole config is shown.
    for key in keys:
        node = node[key]

    if type(node) == dict:
        # Mappings are rendered as block-style YAML. When a sub-tree was selected,
        # every line is prefixed so the output appears indented.
        rendered = yaml.dump(node, default_flow_style=False)
        if keys:
            rendered = " " + rendered.replace("\n", "\n ")
        rendered = rendered.rstrip()
    else:
        # Anything that is not a plain dict is printed as-is.
        rendered = node
    print(rendered)

    task_logger.info(f"Read: {Path(config_path)}")
|
||
|
||
@click.command(name="write")
@click.argument("keys", nargs=-1)
@click.argument("value_type")
@click.argument("value")
@click.pass_obj
def manage_write(dbgym_cfg, keys, value_type, value):
    # Overwrite one leaf value in the config YAML and write the whole file back.
    #
    #   keys:       path from the YAML root to the leaf to overwrite (at least one key).
    #   value_type: name of a Python builtin (e.g. "int", "str") used to convert `value`.
    #   value:      the new value, as the string given on the command line.
    #
    # Raises click.UsageError when no key is given, KeyError when a key is missing and
    # AssertionError when the addressed entry is a mapping rather than a leaf.
    # (Comments instead of a docstring so click's generated help text is unchanged.)

    # `__builtins__` is only a module inside __main__; in an imported module it is a
    # plain dict, on which getattr() fails. The builtins module works in both cases.
    import builtins

    if not keys:
        raise click.UsageError("manage write needs at least one key before VALUE_TYPE and VALUE")

    config_path = dbgym_cfg.path
    root_yaml = dbgym_cfg.yaml

    # Walk down to the mapping that holds the leaf.
    parent_yaml = root_yaml
    for key in keys[:-1]:
        parent_yaml = parent_yaml[key]
    leaf_key = keys[-1]

    # Only existing leaves may be overwritten, never a whole sub-mapping.
    assert not isinstance(parent_yaml[leaf_key], dict)

    # NOTE(review): any builtin name is accepted here, including callables such as
    # eval/exec. That is tolerable for a local CLI, but an allow-list of converters
    # would be safer. Also note that bool("False") is True.
    converter = getattr(builtins, value_type)
    parent_yaml[leaf_key] = converter(value)

    # Serialize from the root so that the rest of the config is preserved.
    new_yaml = yaml.dump(root_yaml, default_flow_style=False).rstrip()
    Path(config_path).write_text(new_yaml)

    task_logger.info(f"Updated: {Path(config_path)}")
|
||
|
||
@click.command(name="standardize")
@click.pass_obj
def manage_standardize(dbgym_cfg):
    # Re-dump the already loaded config as block-style YAML and overwrite the config
    # file with it, then log the file that was updated.
    # (Comment instead of a docstring so click's generated help text is unchanged.)
    config_path = dbgym_cfg.path
    config_yaml = dbgym_cfg.yaml

    serialized = yaml.dump(config_yaml, default_flow_style=False)
    target_fpath = Path(config_path)
    target_fpath.write_text(serialized.rstrip())

    task_logger.info(f"Updated: {target_fpath}")
|
||
|
||
@click.command("clean")
@click.pass_obj
@click.option(
    "--mode",
    type=click.Choice(["safe", "aggressive"]),
    default="safe",
    help=(
        'The mode to clean the workspace (default="safe"). '
        '"aggressive" means "only keep run_*/ folders referenced by a file in symlinks/". '
        '"safe" means "in addition to that, recursively keep any run_*/ folders '
        'referenced by any symlinks in run_*/ folders we are keeping."'
    ),
)
def manage_clean(dbgym_cfg: DBGymConfig, mode: str):
    # CLI entry point for workspace pruning: forwards the chosen mode to
    # clean_workspace() with its summary logging switched on.
    # (Comment instead of a docstring so click's generated help text is unchanged.)
    clean_workspace(dbgym_cfg, mode=mode, verbose=True)
|
||
|
||
@click.command("count")
@click.pass_obj
def manage_count(dbgym_cfg: DBGymConfig):
    # Print the number of workspace entries reported by _count_files_in_workspace().
    # (Comment instead of a docstring so click's generated help text is unchanged.)
    total = _count_files_in_workspace(dbgym_cfg)
    workspace_dpath = dbgym_cfg.dbgym_workspace_path
    print(f"The workspace ({workspace_dpath}) has {total} total files/dirs/symlinks.")
|
||
|
||
def add_symlinks_in_dpath(symlinks_stack: List[Path], root_dpath: Path, processed_symlinks: Set[Path]) -> None:
    """
    Queue every not-yet-seen symlink found under root_dpath.

    root_dpath is traversed with os.walk(). Each entry that is a symlink and is not
    already in processed_symlinks is appended to symlinks_stack and recorded in
    processed_symlinks. Both containers are mutated in place; nothing is returned.
    """
    for walk_root, subdir_names, plain_names in os.walk(root_dpath):
        parent_dpath = Path(walk_root)
        # os.walk() lists a symlink among the directories when it points at a directory
        # and among the files otherwise, so both lists have to be scanned.
        for entry_name in chain(subdir_names, plain_names):
            candidate = parent_dpath / entry_name
            if not candidate.is_symlink() or candidate in processed_symlinks:
                continue
            symlinks_stack.append(candidate)
            processed_symlinks.add(candidate)
|
||
|
||
def _count_files_in_workspace(dbgym_cfg: DBGymConfig) -> int:
    """
    Count the entries found while walking dbgym_cfg.dbgym_workspace_path.

    Every name os.walk() reports as a file is counted, plus every sub-directory that
    is not a symlink. Names in the directory list that are symlinks are filtered out
    before counting, and the workspace root itself is not counted.
    """
    entry_count = 0
    for walk_root, subdir_names, file_names in os.walk(dbgym_cfg.dbgym_workspace_path, followlinks=False):
        real_subdir_names = [
            name for name in subdir_names if not os.path.islink(os.path.join(walk_root, name))
        ]
        # Updated in place, as the original did, so os.walk() sees the filtered list.
        subdir_names[:] = real_subdir_names
        entry_count += len(file_names) + len(real_subdir_names)

    return entry_count
|
||
|
||
def clean_workspace(dbgym_cfg: DBGymConfig, mode: str = "safe", verbose=False) -> None:
    """
    Delete every direct child of [workspace]/task_runs/ that no "active symlink" refers to.

    If mode is "aggressive", the active symlinks are only those found under
    [workspace]/symlinks/. If mode is "safe", symlinks found inside any task_runs/
    child that is being kept are followed as well, transitively.

    Args:
        dbgym_cfg: Provides dbgym_symlinks_path, dbgym_runs_path and the workspace path.
        mode: "safe" or "aggressive". Any value other than "safe" skips the transitive step.
        verbose: When true, log how many workspace entries were removed.

    Raises:
        AssertionError: If a processed symlink's raw target text differs from its fully
            resolved path, or if it resolves to task_runs/ itself.
    """
    # This stack holds the symlinks that are left to be processed.
    symlink_fpaths_to_process: List[Path] = []
    # Every symlink ever pushed, so that none is processed twice (avoids infinite loops).
    processed_symlinks: Set[Path] = set()

    # 1. Initialize the stack with the symlinks under symlinks/.
    if dbgym_cfg.dbgym_symlinks_path.exists():
        add_symlinks_in_dpath(symlink_fpaths_to_process, dbgym_cfg.dbgym_symlinks_path, processed_symlinks)

    # 2. Go through the symlinks, figuring out which "children of task runs" to keep.
    #    By the rules of the framework these should be run_*/ directories, but the user's
    #    workspace might contain other directories or plain files directly in task_runs/.
    #    Hence the term "task_run_child_fordpaths" (file-or-directory paths).
    task_run_child_fordpaths_to_keep: Set[Path] = set()

    if dbgym_cfg.dbgym_runs_path.exists():
        while symlink_fpaths_to_process:
            symlink_fpath: Path = symlink_fpaths_to_process.pop()
            assert symlink_fpath.is_symlink()

            # Path.resolve() follows every layer of symlinks, while os.readlink() returns
            # the literal text of this one link. If they differ, the link is treated as
            # pointing at another symlink.
            # NOTE(review): because the raw link text is compared, a relative or
            # non-normalized target also trips this assert.
            real_fordpath = symlink_fpath.resolve()
            one_layer_resolved_fordpath = os.readlink(symlink_fpath)
            assert str(real_fordpath) == str(one_layer_resolved_fordpath), f"symlink_fpath ({symlink_fpath}) seems to point to *another* symlink. This is difficult to handle, so it is currently disallowed. Please resolve this situation manually."

            # Dangling links are ignored.
            if not real_fordpath.exists():
                continue
            # We only decide which direct children of task_runs/ to save; targets that
            # are not inside task_runs/ are irrelevant here.
            if not is_child_path(real_fordpath, dbgym_cfg.dbgym_runs_path):
                continue

            assert not os.path.samefile(real_fordpath, dbgym_cfg.dbgym_runs_path)

            # Climb from the target up to the direct child of task_runs/ that contains it.
            # If the target already is a direct child, the loop body never runs. Either
            # way that child is kept, even if the layout breaks the framework's rules.
            task_run_child_fordpath = real_fordpath
            while not os.path.samefile(parent_dpath_of_path(task_run_child_fordpath), dbgym_cfg.dbgym_runs_path):
                task_run_child_fordpath = parent_dpath_of_path(task_run_child_fordpath)
            assert os.path.samefile(parent_dpath_of_path(task_run_child_fordpath), dbgym_cfg.dbgym_runs_path), f"task_run_child_fordpath ({task_run_child_fordpath}) is not a direct child of dbgym_cfg.dbgym_runs_path"
            task_run_child_fordpaths_to_keep.add(task_run_child_fordpath)

            # In safe mode, symlinks inside a kept child keep their targets alive too.
            if mode == "safe":
                add_symlinks_in_dpath(symlink_fpaths_to_process, task_run_child_fordpath, processed_symlinks)

    # 3. Go through all children of task_runs/, deleting any that we weren't told to keep.
    #    Symlinks might point outside of task_runs/; those targets are never touched.
    starting_num_files = _count_files_in_workspace(dbgym_cfg)
    if dbgym_cfg.dbgym_runs_path.exists():
        # The keep-set entries come from Path.resolve(), so compare against the resolved
        # form of task_runs/. Otherwise a symlinked path component above task_runs/ would
        # make every kept child look unreferenced.
        resolved_runs_dpath = dbgym_cfg.dbgym_runs_path.resolve()
        # Snapshot the listing first so that deletions do not disturb the iteration.
        for child_fordpath in list(dbgym_cfg.dbgym_runs_path.iterdir()):
            if resolved_runs_dpath / child_fordpath.name in task_run_child_fordpaths_to_keep:
                continue
            if child_fordpath.is_dir() and not child_fordpath.is_symlink():
                shutil.rmtree(child_fordpath)
            else:
                # Plain files and symlinks. shutil.rmtree() refuses symlinks to
                # directories, and only the link itself should be removed anyway.
                os.remove(child_fordpath)
    ending_num_files = _count_files_in_workspace(dbgym_cfg)

    if verbose:
        task_logger.info(f"Removed {starting_num_files - ending_num_files} out of {starting_num_files} files")
        task_logger.info(f"Workspace went from {starting_num_files} to {ending_num_files}")
|
||
|
||
# Attach every subcommand to the "manage" group, in the same order as before.
for _manage_subcommand in (
    manage_show,
    manage_write,
    manage_standardize,
    manage_clean,
    manage_count,
):
    manage_group.add_command(_manage_subcommand)
# Do not leave the loop variable behind as a module attribute.
del _manage_subcommand
Empty file.
Oops, something went wrong.