MRG: provide default failed filenames based on CSV (#195)
Fixes #192

`--failed` now defaults to `os.path.basename(csv_file) + '.fail.csv'`, and
`--checksum-fail` defaults to `os.path.basename(csv_file) + '.checksum_fail.csv'`.
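
For reference, a minimal sketch of the default-path logic this commit introduces (the helper name `default_failed_paths` is illustrative only, not part of the plugin):

```python
# Minimal sketch of the defaults added in this commit; the helper name
# `default_failed_paths` is illustrative and not part of the plugin itself.
import os

def default_failed_paths(input_csv):
    """Derive default --failed / --checksum-fail paths from the input CSV name."""
    base = os.path.basename(input_csv)
    return base + '.fail.csv', base + '.checksum_fail.csv'

# default_failed_paths('path/to/acc.csv')
# -> ('acc.csv.fail.csv', 'acc.csv.checksum_fail.csv')
```

Because only the basename is used, the default failure CSVs are written to the current working directory.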

---------

Co-authored-by: Tessa Pierce Ward <[email protected]>
ctb and bluegenes authored Jan 28, 2025
1 parent 37c65fd commit ba5d26d
Showing 3 changed files with 89 additions and 3 deletions.
11 changes: 8 additions & 3 deletions src/python/sourmash_plugin_directsketch/__init__.py
@@ -87,12 +87,10 @@ def __init__(self, p):
p.add_argument(
"--failed",
help="CSV of failed accessions and download links (should be mostly protein).",
required=True,
)
p.add_argument(
"--checksum-fail",
help="CSV of accessions where the md5sum check failed or the md5sum file was improperly formatted or could not be downloaded.",
required=True,
)
p.add_argument(
"-p",
@@ -177,6 +175,11 @@ def main(self, args):

num_threads = set_thread_pool(args.cores)

if args.failed is None:
args.failed = os.path.basename(args.input_csv) + '.fail.csv'
if args.checksum_fail is None:
args.checksum_fail = os.path.basename(args.input_csv) + '.checksum_fail.csv'

if args.n_simultaneous_downloads is None:
if args.api_key:
notify("API key provided - setting --n-simultaneous-downloads to 9")
@@ -260,7 +263,6 @@ def __init__(self, p):
p.add_argument(
"--failed",
help="CSV of failed accessions and download links.",
required=True,
)
# don't require checksum_fail here b/c users don't need to provide checksums
p.add_argument(
@@ -333,6 +335,9 @@ def main(self, args):

num_threads = set_thread_pool(args.cores)

if args.failed is None:
args.failed = os.path.basename(args.input_csv) + '.fail.csv'

notify(
f"Downloading and sketching all accessions in '{args.input_csv} using {args.n_simultaneous_downloads} simultaneous downloads, {args.retry_times} retries, and {num_threads} threads."
)
41 changes: 41 additions & 0 deletions tests/test_gbsketch.py
@@ -76,6 +76,47 @@ def test_gbsketch_simple(runtmp, capfd):
assert range == ""


def test_gbsketch_simple_default_failed(runtmp, capfd):
# test the default value for --failed
acc_csv = get_test_data('acc.csv')
output = runtmp.output('simple.zip')
failed = runtmp.output('acc.csv.fail.csv')
ch_fail = runtmp.output('checksum_dl_failed.csv')

sig1 = get_test_data('GCA_000175535.1.sig.gz')
sig2 = get_test_data('GCA_000961135.2.sig.gz')
sig3 = get_test_data('GCA_000961135.2.protein.sig.gz')
ss1 = sourmash.load_one_signature(sig1, ksize=31)
ss2 = sourmash.load_one_signature(sig2, ksize=31)
# note: sourmash stores protein ksizes multiplied by 3, so the k=10 protein sketch is selected with ksize=30
ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein')

runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output,
'-r', '3',
'--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200",
in_dir=runtmp.output(''))

assert os.path.exists(output)
assert not runtmp.last_result.out # stdout should be empty
captured = capfd.readouterr()
print(captured.err)
print(f"looking for path: {output}")

assert os.path.exists(failed)
with open(failed, 'r') as failF:
fail_lines = failF.readlines()
print(fail_lines)
assert len(fail_lines) == 2
assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url,range\n"
acc, name, moltype, md5sum, download_filename, url, range = fail_lines[1].strip().split(',')
assert acc == "GCA_000175535.1"
assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14"
assert moltype == "protein"
assert download_filename == "GCA_000175535.1_protein.faa.gz"
assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz"
assert range == ""


def test_gbsketch_manifest(runtmp, capfd):
acc_csv = get_test_data('acc.csv')
output = runtmp.output('simple.zip')
40 changes: 40 additions & 0 deletions tests/test_urlsketch.py
@@ -74,6 +74,46 @@ def test_urlsketch_simple(runtmp):
assert range == ""


def test_urlsketch_simple_default_failed(runtmp):
# check default value for --failed
acc_csv = get_test_data('acc-url.csv')
output = runtmp.output('simple.zip')
failed = runtmp.output('acc-url.csv.fail.csv')

sig1 = get_test_data('GCA_000175535.1.sig.gz')
sig2 = get_test_data('GCA_000961135.2.sig.gz')
sig3 = get_test_data('GCA_000961135.2.protein.sig.gz')
ss1 = sourmash.load_one_signature(sig1, ksize=31)
ss2 = sourmash.load_one_signature(sig2, ksize=31)
ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein')

runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output,
'-r', '1',
'--param-str', "dna,k=31,scaled=1000",
'-p', "protein,k=10,scaled=200",
in_dir=runtmp.output(''))

assert os.path.exists(output)
assert not runtmp.last_result.out # stdout should be empty

idx = sourmash.load_file_as_index(output)
sigs = list(idx.signatures())

assert os.path.exists(failed)
with open(failed, 'r') as failF:
header = next(failF).strip()
assert header == "accession,name,moltype,md5sum,download_filename,url,range"
for line in failF:
print(line)
acc, name, moltype, md5sum, download_filename, url, range = line.strip().split(',')
assert acc == "GCA_000175535.1"
assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14"
assert moltype == "protein"
assert download_filename == "GCA_000175535.1_protein.faa.gz"
assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz"
assert range == ""


def test_urlsketch_manifest(runtmp, capfd):
acc_csv = get_test_data('acc-url.csv')
output = runtmp.output('simple.zip')
