Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automated workspace pruning/cleaning #35

Merged
merged 32 commits into from
Jul 15, 2024
Merged
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
642a77f
basic os.walk
wangpatrick57 Apr 3, 2024
6c93ae4
config group -> manage group
wangpatrick57 Apr 4, 2024
699d1f8
Merge branch 'main' into clean-workspace
wangpatrick57 Apr 4, 2024
ff122d0
renamed unit test folders
wangpatrick57 Apr 4, 2024
20aa6ed
create_structure function
wangpatrick57 Apr 4, 2024
7dbf90d
del scratchspace
wangpatrick57 Apr 4, 2024
5addc5f
wrote test_structure_helpers
wangpatrick57 Apr 5, 2024
8538f66
wrote basic clean_workspace
wangpatrick57 Apr 5, 2024
1a6b204
fixed create_structure to do symlinks from root_path
wangpatrick57 Apr 5, 2024
768cdb9
comments for more tests for clean
wangpatrick57 Apr 5, 2024
2e766f8
wrote empty tests
wangpatrick57 Apr 5, 2024
155def0
refactored parent_dir and dir_basename functions
wangpatrick57 Apr 5, 2024
366e288
merged with main
wangpatrick57 Apr 5, 2024
49d5e64
fixed linking to dirs
wangpatrick57 Apr 5, 2024
702afa1
fixed file in dir in runs
wangpatrick57 Apr 6, 2024
b42a095
link to link crashing now
wangpatrick57 Apr 6, 2024
e0b4743
added test for aggressive mode
wangpatrick57 Apr 6, 2024
35faba2
fixed verify structure to be fine with broken symlinks
wangpatrick57 Apr 6, 2024
059ffd2
tests for looping infinitely
wangpatrick57 Apr 10, 2024
8106f02
merge
wangpatrick57 Apr 25, 2024
64a555b
fixed infinite loop problem
wangpatrick57 Apr 25, 2024
2592db5
added some tests
wangpatrick57 Apr 26, 2024
2d41c9a
added more tests
wangpatrick57 Apr 26, 2024
bf8cd32
added all tests
wangpatrick57 Apr 26, 2024
a6805a8
wrote _count_files_in_workspace()
wangpatrick57 Apr 26, 2024
3dbeac0
comment
wangpatrick57 Apr 26, 2024
2a838cb
merged with main
wangpatrick57 Jul 8, 2024
3dd9923
small typo
wangpatrick57 Jul 8, 2024
5f112c6
Update test_clean.py
wangpatrick57 Jul 9, 2024
da3c0d2
Update test_clean.py
wangpatrick57 Jul 9, 2024
54d1e7e
removed duplicated test
wangpatrick57 Jul 9, 2024
92462e4
changed print to logger.info()
wangpatrick57 Jul 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
__pycache__/
.conda/
.idea/
test_clean_scratchspace/

workspace/
default_*_benchbase_config_*.xml
File renamed without changes.
File renamed without changes.
2 changes: 2 additions & 0 deletions dependencies/rust.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
# Download the rustup installer script (HTTPS only, TLS 1.2 or newer; curl runs
# silently but still reports and fails on errors) and pipe it straight into sh.
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
File renamed without changes.
218 changes: 218 additions & 0 deletions manage/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
import shutil
from typing import List, Set
import click
import yaml
import logging
from pathlib import Path
from misc.utils import DBGymConfig, is_child_path, parent_dpath_of_path
from itertools import chain
import os

from misc.utils import get_symlinks_path_from_workspace_path

# Logger shared by the manage commands in this module; INFO level so the
# "Read:" / "Updated:" messages emitted by the commands are not filtered out by this logger.
task_logger = logging.getLogger("task")
task_logger.setLevel(logging.INFO)


@click.group(name="manage")
def manage_group():
    # Root of the "manage" command group. It does no work itself; the subcommands
    # are attached with manage_group.add_command(...) at the bottom of this file.
    # (A "#" comment is used instead of a docstring so the --help text is unchanged.)
    pass


@click.command(name="show")
@click.argument("keys", nargs=-1)
@click.pass_obj
def manage_show(dbgym_cfg, keys):
    # Print the config value reached by following `keys` (in order) from the root
    # of the loaded YAML. With no keys, the whole config is printed.
    config_path = dbgym_cfg.path

    # Descend one level per key.
    node = dbgym_cfg.yaml
    for key in keys:
        node = node[key]

    if type(node) is dict:
        # Mappings are rendered as block-style YAML. When we descended at least one
        # level, every line is prefixed to visually nest the output.
        rendered = yaml.dump(node, default_flow_style=False)
        if keys:
            rendered = " " + rendered.replace("\n", "\n ")
        rendered = rendered.rstrip()
    else:
        # Anything that is not a plain dict is printed as-is.
        rendered = node
    print(rendered)

    task_logger.info(f"Read: {Path(config_path)}")


@click.command(name="write")
@click.argument("keys", nargs=-1)
@click.argument("value_type")
@click.argument("value")
@click.pass_obj
def manage_write(dbgym_cfg, keys, value_type, value):
    # Overwrite one leaf of the config YAML and write the whole file back.
    #
    #   keys       -- path of keys from the YAML root to the leaf being replaced
    #   value_type -- name of a builtin used to convert `value` (e.g. "int", "str")
    #   value      -- the new value, as the string typed on the command line
    #
    # Raises KeyError if a key does not exist and AssertionError if the target is a
    # mapping rather than a leaf.
    #
    # Look the converter up on the `builtins` module. The bare `__builtins__` name is
    # a CPython implementation detail: it is the module only inside `__main__` and a
    # plain dict in imported modules, where getattr() on it cannot find "int" etc.
    import builtins

    config_path = dbgym_cfg.path
    config_yaml = dbgym_cfg.yaml

    # Traverse the YAML down to the parent of the leaf.
    root_yaml = config_yaml
    for key in keys[:-1]:
        config_yaml = config_yaml[key]

    # Modify the requested YAML value and write the YAML file.
    leaf_key = keys[-1]
    assert not isinstance(config_yaml[leaf_key], dict), (
        f"{leaf_key} is a mapping, not a leaf value; refusing to overwrite it"
    )
    # NOTE(review): value_type may name *any* builtin callable (including eval/exec),
    # so this command must only ever be driven by a trusted local user.
    config_yaml[leaf_key] = getattr(builtins, value_type)(value)
    new_yaml = yaml.dump(root_yaml, default_flow_style=False).rstrip()
    Path(config_path).write_text(new_yaml)

    task_logger.info(f"Updated: {Path(config_path)}")


@click.command(name="standardize")
@click.pass_obj
def manage_standardize(dbgym_cfg):
    # Rewrite the config file from the already-loaded YAML so that it ends up in
    # yaml.dump's block style, without a trailing newline.
    target_path = Path(dbgym_cfg.path)
    normalized_text = yaml.dump(dbgym_cfg.yaml, default_flow_style=False).rstrip()
    target_path.write_text(normalized_text)

    task_logger.info(f"Updated: {target_path}")


@click.command("clean")
@click.pass_obj
@click.option(
    "--mode",
    type=click.Choice(["safe", "aggressive"]),
    default="safe",
    help="The mode to clean the workspace (default=\"safe\"). \"aggressive\" means \"only keep run_*/ folders referenced by a file in symlinks/\". \"safe\" means \"in addition to that, recursively keep any run_*/ folders referenced by any symlinks in run_*/ folders we are keeping.\""
)
def manage_clean(dbgym_cfg: DBGymConfig, mode: str):
    # CLI wrapper around clean_workspace(): forwards the chosen mode and always
    # enables verbose output so the user sees how much was removed.
    clean_workspace(dbgym_cfg, mode=mode, verbose=True)


@click.command("count")
@click.pass_obj
def manage_count(dbgym_cfg: DBGymConfig):
    # Report how many entries the workspace currently holds, as computed by
    # _count_files_in_workspace().
    num_files = _count_files_in_workspace(dbgym_cfg)
    summary = f"The workspace ({dbgym_cfg.dbgym_workspace_path}) has {num_files} total files/dirs/symlinks."
    print(summary)


def add_symlinks_in_dpath(symlinks_stack: List[Path], root_dpath: Path, processed_symlinks: Set[Path]) -> None:
    """
    Collect every not-yet-seen symlink found under root_dpath.

    Both arguments symlinks_stack and processed_symlinks are mutated in place: each
    newly discovered symlink is pushed onto symlinks_stack and recorded in
    processed_symlinks, so a symlink is never queued twice across calls.
    """
    for walk_root, subdir_names, plain_names in os.walk(root_dpath):
        base_dpath = Path(walk_root)
        # A symlink may be reported as a directory or as a file depending on its
        # target, so both name lists are inspected (directories first).
        for entry_name in [*subdir_names, *plain_names]:
            candidate = base_dpath / entry_name
            if not candidate.is_symlink() or candidate in processed_symlinks:
                continue
            symlinks_stack.append(candidate)
            processed_symlinks.add(candidate)


def _count_files_in_workspace(dbgym_cfg: DBGymConfig) -> int:
    """
    Counts the number of files (regular file or dir or symlink) in the workspace.

    The workspace root itself is not counted. Symlinks are counted as single entries
    and are never followed, so whatever they point at is not counted through them.
    """
    # With followlinks=False, os.walk() reports symlinks to directories in dirnames
    # but does not descend into them, so every entry is seen exactly once. Symlinks
    # to directories therefore must NOT be filtered out of dirnames before counting,
    # otherwise they would be missing from the total.
    return sum(
        len(dirnames) + len(filenames)
        for _dirpath, dirnames, filenames in os.walk(dbgym_cfg.dbgym_workspace_path, followlinks=False)
    )


def clean_workspace(dbgym_cfg: DBGymConfig, mode: str="safe", verbose=False) -> None:
    """
    Clean all [workspace]/task_runs/run_*/ directories that are not referenced by any "active symlinks".
    If mode is "aggressive", "active symlinks" means *only* the symlinks directly in [workspace]/symlinks/.
    If mode is "safe", "active symlinks" means the symlinks directly in [workspace]/symlinks/ as well as
        any symlinks referenced in task_runs/run_*/ directories we have already decided to keep.

    Args:
        dbgym_cfg: Config object providing dbgym_symlinks_path and dbgym_runs_path (and the
            workspace path used for counting).
        mode: "safe" or "aggressive", as described above.
        verbose: If True, print how many files were removed and the before/after totals.

    Raises:
        AssertionError: If an active symlink points to another symlink (its one-layer target
            differs from its fully resolved target). This is currently disallowed.
    """
    runs_dpath = dbgym_cfg.dbgym_runs_path

    # This stack holds the symlinks that are left to be processed
    symlink_fpaths_to_process: List[Path] = []
    # This set holds the symlinks that have already been processed to avoid infinite loops
    processed_symlinks: Set[Path] = set()

    # 1. Initialize paths to process
    if dbgym_cfg.dbgym_symlinks_path.exists():
        add_symlinks_in_dpath(symlink_fpaths_to_process, dbgym_cfg.dbgym_symlinks_path, processed_symlinks)

    # 2. Go through symlinks, figuring out which "children of task runs" to keep
    # Based on the rules of the framework, "children of task runs" should be run_*/ directories.
    # However, the user's workspace might happen to break these rules by putting directories not
    # named "run_*/" or files directly in task_runs/. Thus, I use the term "task_run_child_fordpaths"
    # instead of "run_dpaths".
    task_run_child_fordpaths_to_keep: Set[Path] = set()

    if runs_dpath.exists():
        while symlink_fpaths_to_process:
            symlink_fpath: Path = symlink_fpaths_to_process.pop()
            assert symlink_fpath.is_symlink()
            # Path.resolve() resolves all layers of symlinks while os.readlink() only resolves one layer.
            # However, os.readlink() literally reads the string contents of the link, which may be a
            # relative path. Relative targets are interpreted relative to the directory containing the
            # link, so we anchor the target there and normalize it to an absolute path before comparing.
            # (Joining an already-absolute target onto the parent simply yields the target itself.)
            real_fordpath = symlink_fpath.resolve()
            one_layer_resolved_fordpath = os.path.abspath(symlink_fpath.parent / os.readlink(symlink_fpath))
            assert str(real_fordpath) == one_layer_resolved_fordpath, f"symlink_fpath ({symlink_fpath}) seems to point to *another* symlink. This is difficult to handle, so it is currently disallowed. Please resolve this situation manually."

            # If the file doesn't exist, we'll just ignore it.
            if not real_fordpath.exists():
                continue
            # We're only trying to figure out which direct children of task_runs/ to save. If the file isn't
            # even a descendant, we don't care about it.
            if not is_child_path(real_fordpath, runs_dpath):
                continue

            assert not os.path.samefile(real_fordpath, runs_dpath)

            # Figure out the task_run_child_fordpath to put into task_run_child_fordpaths_to_keep by
            # walking up from the target until we reach the direct child of task_runs/.
            # - If the target already is a direct child, the loop body never runs. It shouldn't be
            #   possible to symlink to something directly in task_runs/, but if the user happens to
            #   have one like this, it's just a good idea not to delete it.
            # - Technically, it's not allowed to symlink to any files not in
            #   task_runs/run_*/[codebase]/[organization]/ either. However, as with above, we won't
            #   just nuke files if the workspace doesn't follow this rule for some reason.
            # The loop's exit condition guarantees the result is a direct child of task_runs/.
            task_run_child_fordpath = real_fordpath
            while not os.path.samefile(parent_dpath_of_path(task_run_child_fordpath), runs_dpath):
                task_run_child_fordpath = parent_dpath_of_path(task_run_child_fordpath)
            task_run_child_fordpaths_to_keep.add(task_run_child_fordpath)

            # If on safe mode, add symlinks inside the task_run_child_fordpath to be processed
            if mode == "safe":
                add_symlinks_in_dpath(symlink_fpaths_to_process, task_run_child_fordpath, processed_symlinks)

    # 3. Go through all children of task_runs/*, deleting any that we weren't told to keep
    # It's true that symlinks might link outside of task_runs/*. We'll just not care about those
    starting_num_files = _count_files_in_workspace(dbgym_cfg)
    if runs_dpath.exists():
        # The paths in the keep-set come from Path.resolve(), whereas iterdir() yields paths spelled
        # relative to runs_dpath as configured. If runs_dpath is reached through a symlinked or
        # relative path, the two spellings differ, so we also check the resolved spelling. This can
        # only ever cause *more* entries to be kept, never fewer.
        resolved_runs_dpath = runs_dpath.resolve()
        for child_fordpath in runs_dpath.iterdir():
            if (
                child_fordpath in task_run_child_fordpaths_to_keep
                or resolved_runs_dpath / child_fordpath.name in task_run_child_fordpaths_to_keep
            ):
                continue
            if child_fordpath.is_dir() and not child_fordpath.is_symlink():
                shutil.rmtree(child_fordpath)
            else:
                # Regular files and symlinks (shutil.rmtree() refuses to operate on a symlink to a
                # directory, so those are unlinked here too; only the link itself is removed).
                os.remove(child_fordpath)
    ending_num_files = _count_files_in_workspace(dbgym_cfg)

    if verbose:
        print(f"Removed {starting_num_files - ending_num_files} out of {starting_num_files} files")
        print(f"Workspace went from {starting_num_files} to {ending_num_files}")


# Register every subcommand defined above on the "manage" group.
manage_group.add_command(manage_show)
manage_group.add_command(manage_write)
manage_group.add_command(manage_standardize)
manage_group.add_command(manage_clean)
manage_group.add_command(manage_count)
Empty file added manage/tests/__init__.py
Empty file.
Loading
Loading