Skip to content

Commit

Permalink
Virtualize filenames as in-container paths from point of view of WDL …
Browse files Browse the repository at this point in the history
…command (#4527)

* Present container FS as the virtual FS in commands

* Explain the dueling virtualization schemes

* Satisfy MyPy

* Fix typo

---------

Co-authored-by: Lon Blauvelt <[email protected]>
  • Loading branch information
adamnovak and DailyDreaming authored Jul 25, 2023
1 parent cc36e30 commit 7cf8c33
Showing 1 changed file with 88 additions and 9 deletions.
97 changes: 88 additions & 9 deletions src/toil/wdl/wdltoil.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,32 @@ def _call_eager(self, expr: "WDL.Expr.Apply", arguments: List[WDL.Value.Base]) -
# Return the result as a WDL float value
return WDL.Value.Float(total_size)


# Both the WDL code itself **and** the commands that it runs will deal in
# "virtualized" filenames.

# We have to guarantee that "When a WDL author uses a File input in their
# Command Section, the fully qualified, localized path to the file is
# substituted when that declaration is referenced in the command template."

# This has to be true even if the File is the result of a WDL function that is
# run *during* the evaluation of the command string, via a placeholder
# expression evaluation.

# Really there are 3 filename spaces in play: Toil filestore URLs,
# outside-the-container host filenames, and inside-the-container filenames. But
# the MiniWDL machinery only gives us 2 levels to work with: "virtualized"
# (visible to the workflow) and "devirtualized" (openable by this process).

# So we sneakily swap out what "virtualized" means. Usually (as provided by
# ToilWDLStdLibBase) a "virtualized" filename is the Toil filestore URL space.
# But when evaluating a task command, we switch things so that the
# "virtualized" space is the inside-the-container filename space (by
# devirtualizing and then host-to-container-mapping all the visible files, and
# then using ToilWDLStdLibTaskCommand for evaluating expressions, and then
# going back from container to host space after the command). At all times the
# "devirtualized" space is outside-the-container host filenames.

class ToilWDLStdLibBase(WDL.StdLib.Base):
"""
Standard library implementation for WDL as run on Toil.
Expand All @@ -398,14 +424,7 @@ def __init__(self, file_store: AbstractFileStore):
self.size = NonDownloadingSize(self)

# Keep the file store around so we can access files.
self._file_store = file_store

# Both the WDL code itself **and** the commands that it runs will deal in
# "virtualized" filenames by default, so when making commands we need to
# make sure to devirtualize filenames.
# We have to guarantee that "When a WDL author uses a File input in their
# Command Section, the fully qualified, localized path to the file is
# substituted when that declaration is referenced in the command template."
self._file_store = file_store

def _is_url(self, filename: str, schemes: List[str] = ['http:', 'https:', 's3:', 'gs:', TOIL_URI_SCHEME]) -> bool:
"""
Expand Down Expand Up @@ -470,6 +489,63 @@ def _virtualize_filename(self, filename: str) -> str:
logger.debug('Virtualized %s as WDL file %s', filename, result)
return result

class ToilWDLStdLibTaskCommand(ToilWDLStdLibBase):
"""
Standard library implementation to use inside a WDL task command evaluation.
Expects all the filenames in variable bindings to be container-side paths;
these are the "virtualized" filenames, while the "devirtualized" filenames
are host-side paths.
"""

def __init__(self, file_store: AbstractFileStore, container: TaskContainer):
"""
Set up the standard library for the task command section.
"""

# TODO: Don't we want to make sure we don't actually use the file store?
super().__init__(file_store)
self.container = container

def _devirtualize_filename(self, filename: str) -> str:
"""
Go from a virtualized WDL-side filename to a local disk filename.
Any WDL-side filenames which are paths will be paths in the container.
"""
if self._is_url(filename):
# We shouldn't have to deal with URLs here; we want to have exactly
# two nicely stacked/back-to-back layers of virtualization, joined
# on the out-of-container paths.
raise RuntimeError(f"File {filename} is a URL but should already be an in-container-virtualized filename")

# If this is a local path it will be in the container. Make sure we
# use the out-of-container equivalent.
result = self.container.host_path(filename)

if result is None:
# We really shouldn't have files in here that we didn't virtualize.
raise RuntimeError(f"File {filename} in container is not mounted from the host and can't be opened from the host")

logger.debug('Devirtualized %s as out-of-container file %s', filename, result)
return result


def _virtualize_filename(self, filename: str) -> str:
"""
From a local path in write_dir, 'virtualize' into the filename as it should present in a
File value, when substituted into a command in the container.
"""

if filename not in self.container.input_path_map:
# Mount the file.
self.container.add_paths([filename])

result = self.container.input_path_map[filename]

logger.debug('Virtualized %s as WDL file %s', filename, result)
return result

class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
"""
Standard library implementation for WDL as run on Toil, with additional
Expand Down Expand Up @@ -1218,8 +1294,11 @@ def patched_run_invocation(*args: Any, **kwargs: Any) -> List[str]:
# TODO: MiniWDL deals with directory paths specially here.
contained_bindings = map_over_files_in_bindings(bindings, lambda path: task_container.input_path_map[path])

# Make a new standard library for evaluating the command specifically, which only deals with in-container paths and out-of-container paths.
command_library = ToilWDLStdLibTaskCommand(file_store, task_container)

# Work out the command string, and unwrap it
command_string: str = evaluate_named_expression(self._task, "command", WDL.Type.String(), self._task.command, contained_bindings, standard_library).coerce(WDL.Type.String()).value
command_string: str = evaluate_named_expression(self._task, "command", WDL.Type.String(), self._task.command, contained_bindings, command_library).coerce(WDL.Type.String()).value

# Grab the standard out and error paths. MyPy complains if we call
# them because in the current MiniWDL version they are untyped.
Expand Down

0 comments on commit 7cf8c33

Please sign in to comment.