Skip to content

Commit 82fd093

Browse files
committed
Move the UnimplementedURLException exception out to exceptions.py to prevent a circular import, and add a new argument that lets the user specify the import worker disk size for when streaming is not available. Also fix a file-mutation bug.
1 parent 645fd25 commit 82fd093

File tree

6 files changed

+53
-57
lines changed

6 files changed

+53
-57
lines changed

Diff for: src/toil/cwl/cwltoil.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -140,8 +140,8 @@
140140
NoSuchFileException,
141141
InvalidImportExportUrlException,
142142
LocatorException,
143-
UnimplementedURLException,
144143
)
144+
from toil.lib.exceptions import UnimplementedURLException
145145
from toil.jobStores.fileJobStore import FileJobStore
146146
from toil.jobStores.utils import JobStoreUnavailableException, generate_locator
147147
from toil.lib.io import mkdtemp
@@ -3622,7 +3622,7 @@ def run(self, file_store: AbstractFileStore) -> Any:
36223622
file_to_data = get_file_sizes(
36233623
filenames, file_store.jobStore, include_remote_files=self.options.reference_inputs
36243624
)
3625-
imports_job = ImportsJob(file_to_data, self.options.import_workers_threshold)
3625+
imports_job = ImportsJob(file_to_data, self.options.import_workers_threshold, self.options.import_workers_disk)
36263626
self.addChild(imports_job)
36273627
install_imports_job = CWLInstallImportsJob(
36283628
initialized_job_order=self.initialized_job_order,

Diff for: src/toil/job.py

+9-32
Original file line numberDiff line numberDiff line change
@@ -74,17 +74,16 @@
7474
from toil.resource import ModuleDescriptor
7575
from toil.statsAndLogging import set_logging_from_options
7676

77+
from toil.lib.exceptions import UnimplementedURLException
78+
7779
if TYPE_CHECKING:
7880
from optparse import OptionParser
7981

8082
from toil.batchSystems.abstractBatchSystem import (
8183
BatchJobExitReason
8284
)
8385
from toil.fileStores.abstractFileStore import AbstractFileStore
84-
from toil.jobStores.abstractJobStore import (
85-
AbstractJobStore,
86-
UnimplementedURLException,
87-
)
86+
from toil.jobStores.abstractJobStore import AbstractJobStore
8887

8988
logger = logging.getLogger(__name__)
9089

@@ -3994,25 +3993,16 @@ def __init__(
39943993
self,
39953994
filenames: List[str],
39963995
disk_size: Optional[ParseableIndivisibleResource] = None,
3997-
stream: bool = True,
3998-
**kwargs: Any,
3996+
**kwargs: Any
39993997
):
40003998
"""
40013999
Setup importing files on a worker.
40024000
:param filenames: List of file URIs to import
40034001
:param disk_size: Designated disk space the worker can use when importing. Disregarded if stream is enabled.
4004-
:param stream: Whether to stream a file import or not. We don't have machinery to ensure
4005-
streaming, so if true, assume streaming works and don't give the worker a lot of disk space to work with.
4006-
If streaming fails, the worker will run out of resources and allocate a child job to handle the import with enough disk space.
40074002
:param kwargs: args for the superclass
40084003
"""
4009-
if stream:
4010-
super().__init__(local=False, **kwargs)
4011-
else:
4012-
super().__init__(local=False, disk=disk_size, **kwargs)
40134004
self.filenames = filenames
4014-
self.disk_size = disk_size
4015-
self.stream = stream
4005+
super().__init__(local=False, disk=disk_size, **kwargs)
40164006

40174007
@staticmethod
40184008
def import_files(
@@ -4045,21 +4035,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[Dict[str, FileID]]:
40454035
Import the workflow inputs and then create and run the workflow.
40464036
:return: Promise of workflow outputs
40474037
"""
4048-
try:
4049-
return self.import_files(self.filenames, file_store.jobStore)
4050-
except OSError as e:
4051-
# If the worker crashes due to running out of disk space and was not trying to
4052-
# stream the file import, then try a new import job without streaming by actually giving
4053-
# the worker enough disk space
4054-
# OSError 28 is no space left on device
4055-
if e.errno == 28 and self.stream is True:
4056-
non_streaming_import = WorkerImportJob(
4057-
self.filenames, self.disk_size, stream=False
4058-
)
4059-
self.addChild(non_streaming_import)
4060-
return non_streaming_import.rv()
4061-
else:
4062-
raise
4038+
return self.import_files(self.filenames, file_store.jobStore)
40634039

40644040

40654041
class ImportsJob(Job):
@@ -4073,6 +4049,7 @@ def __init__(
40734049
self,
40744050
file_to_data: Dict[str, FileMetadata],
40754051
max_batch_size: ParseableIndivisibleResource,
4052+
import_worker_disk: ParseableIndivisibleResource,
40764053
**kwargs: Any,
40774054
):
40784055
"""
@@ -4086,6 +4063,7 @@ def __init__(
40864063
super().__init__(local=True, **kwargs)
40874064
self._file_to_data = file_to_data
40884065
self._max_batch_size = max_batch_size
4066+
self._import_worker_disk = import_worker_disk
40894067

40904068
def run(
40914069
self, file_store: AbstractFileStore
@@ -4130,8 +4108,7 @@ def run(
41304108
# Create batch import jobs for each group of files
41314109
for batch in file_batches:
41324110
candidate_uris = [file_to_data[filename][0] for filename in batch]
4133-
batch_size = sum(file_to_data[filename][2] for filename in batch)
4134-
import_jobs.append(WorkerImportJob(candidate_uris, disk_size=batch_size))
4111+
import_jobs.append(WorkerImportJob(candidate_uris, disk_size=self._import_worker_disk))
41354112

41364113
for job in import_jobs:
41374114
self.addChild(job)

Diff for: src/toil/jobStores/abstractJobStore.py

+1-17
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
ServiceJobDescription,
4848
)
4949
from toil.lib.compatibility import deprecated
50+
from toil.lib.exceptions import UnimplementedURLException
5051
from toil.lib.io import WriteWatchingStream
5152
from toil.lib.memoize import memoize
5253
from toil.lib.retry import ErrorCondition, retry
@@ -83,23 +84,6 @@ def __init__(self, url: ParseResult) -> None:
8384
super().__init__("The URL '%s' is invalid." % url.geturl())
8485

8586

86-
class UnimplementedURLException(RuntimeError):
87-
def __init__(self, url: ParseResult, operation: str) -> None:
88-
"""
89-
Make a new exception to report that a URL scheme is not implemented, or
90-
that the implementation can't be loaded because its dependencies are
91-
not installed.
92-
93-
:param url: The given URL
94-
:param operation: Whether we are trying to 'import' or 'export'
95-
"""
96-
super().__init__(
97-
f"No available job store implementation can {operation} the URL "
98-
f"'{url.geturl()}'. Ensure Toil has been installed "
99-
f"with the appropriate extras."
100-
)
101-
102-
10387
class NoSuchJobException(Exception):
10488
"""Indicates that the specified job does not exist."""
10589

Diff for: src/toil/lib/exceptions.py

+18
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
# 5.14.2018: copied into Toil from https://github.com/BD2KGenomics/bd2k-python-lib
1616

1717
import sys
18+
from urllib.parse import ParseResult
1819

1920

2021
# TODO: isn't this built in to Python 3 now?
@@ -61,3 +62,20 @@ def raise_(exc_type, exc_value, traceback) -> None:
6162
if exc.__traceback__ is not traceback:
6263
raise exc.with_traceback(traceback)
6364
raise exc
65+
66+
67+
class UnimplementedURLException(RuntimeError):
68+
def __init__(self, url: ParseResult, operation: str) -> None:
69+
"""
70+
Make a new exception to report that a URL scheme is not implemented, or
71+
that the implementation can't be loaded because its dependencies are
72+
not installed.
73+
74+
:param url: The given URL
75+
:param operation: Whether we are trying to 'import' or 'export'
76+
"""
77+
super().__init__(
78+
f"No available job store implementation can {operation} the URL "
79+
f"'{url.geturl()}'. Ensure Toil has been installed "
80+
f"with the appropriate extras."
81+
)

Diff for: src/toil/options/runner.py

+13-1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,18 @@ def add_runner_options(
3333
dest="import_workers_threshold",
3434
type=lambda x: human2bytes(str(x)),
3535
default="1 GiB",
36-
help="Specify the file size threshold that determines how many files go into a batched import. As many files will go into a batch import job until this threshold"
36+
help="Specify the file size threshold that determines how many files go into a batched import. As many files will go into a batch import job until this threshold "
3737
"is reached. This should be set in conjunction with the argument --runImportsOnWorkers."
3838
)
39+
import_workers_disk_argument = ["--importWorkersDisk"]
40+
if cwl:
41+
import_workers_disk_argument.append("--import-workers-disk")
42+
parser.add_argument(
43+
*import_workers_disk_argument,
44+
dest="import_workers_disk",
45+
type=lambda x: human2bytes(str(x)),
46+
default="1 MiB",
47+
help="Specify the disk size each import worker will get. This may be necessary when file streaming is not possible. For example, downloading from AWS to a GCE "
48+
"job store. If specified, this should be set to the largest file size of all files to import. This should be set in conjunction with the arguments "
49+
"--runImportsOnWorkers and --importWorkersThreshold."
50+
)

Diff for: src/toil/wdl/wdltoil.py

+10-5
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,10 @@
9797
)
9898
from toil.jobStores.abstractJobStore import (
9999
AbstractJobStore,
100-
UnimplementedURLException,
101100
InvalidImportExportUrlException,
102101
LocatorException,
103102
)
103+
from toil.lib.exceptions import UnimplementedURLException
104104
from toil.lib.accelerators import get_individual_local_accelerators
105105
from toil.lib.conversions import VALID_PREFIXES, convert_units, human2bytes
106106
from toil.lib.io import mkdtemp, is_any_url, is_file_url, TOIL_URI_SCHEME, is_standard_url, is_toil_url, is_remote_url
@@ -1290,9 +1290,10 @@ def convert_file_to_uri(file: WDL.Value.File) -> WDL.Value.File:
12901290
file_id, task_path, dir_to_id[file_to_data[file.value][1]], file_basename
12911291
)
12921292

1293-
setattr(file, "virtualized_value", toil_uri)
1294-
file.value = candidate_uri
1295-
return file
1293+
# Don't mutate the original file object
1294+
new_file = WDL.Value.File(file.value)
1295+
setattr(new_file, "virtualized_value", toil_uri)
1296+
return new_file
12961297

12971298
return map_over_files_in_bindings(environment, convert_file_to_uri)
12981299

@@ -5326,6 +5327,7 @@ def __init__(
53265327
inputs_search_path: list[str],
53275328
import_remote_files: bool,
53285329
import_workers_threshold: ParseableIndivisibleResource,
5330+
import_workers_disk: ParseableIndivisibleResource,
53295331
**kwargs: Any,
53305332
):
53315333
"""
@@ -5339,6 +5341,7 @@ def __init__(
53395341
self._inputs_search_path = inputs_search_path
53405342
self._import_remote_files = import_remote_files
53415343
self._import_workers_threshold = import_workers_threshold
5344+
self._import_workers_disk = import_workers_disk
53425345

53435346
def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
53445347
filenames = extract_workflow_inputs(self._inputs)
@@ -5347,8 +5350,9 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
53475350
file_store.jobStore,
53485351
self._inputs_search_path,
53495352
include_remote_files=self._import_remote_files,
5353+
execution_dir=self._wdl_options.get("execution_dir")
53505354
)
5351-
imports_job = ImportsJob(file_to_data, self._import_workers_threshold)
5355+
imports_job = ImportsJob(file_to_data, self._import_workers_threshold, self._import_workers_disk)
53525356
self.addChild(imports_job)
53535357
install_imports_job = WDLInstallImportsJob(
53545358
self._target.name, self._inputs, imports_job.rv()
@@ -5381,6 +5385,7 @@ def make_root_job(
53815385
inputs_search_path=inputs_search_path,
53825386
import_remote_files=options.reference_inputs,
53835387
import_workers_threshold=options.import_workers_threshold,
5388+
import_workers_disk=options.import_workers_disk
53845389
)
53855390
else:
53865391
# Run WDL imports on leader

0 commit comments

Comments (0)