MRG: provide default failed filenames based on CSV (#195)
Fixes #192

`--failed` now defaults to `os.path.basename(csv_file) + '.fail.csv'`, and
`--checksum-fail` defaults to `os.path.basename(csv_file) + '.checksum_fail.csv'`.
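
For reference, a minimal sketch of the default-path logic this commit introduces (the helper name `default_failed_paths` is illustrative only, not part of the plugin):

```python
# Minimal sketch of the defaults added in this commit; the helper name
# `default_failed_paths` is illustrative and not part of the plugin itself.
import os

def default_failed_paths(input_csv):
    """Derive default --failed / --checksum-fail paths from the input CSV name."""
    base = os.path.basename(input_csv)
    return base + '.fail.csv', base + '.checksum_fail.csv'

# default_failed_paths('path/to/acc.csv')
# -> ('acc.csv.fail.csv', 'acc.csv.checksum_fail.csv')
```

Because only the basename is used, the default failure CSVs are written to the current working directory.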

---------

Co-authored-by: Tessa Pierce Ward <[email protected]>
ctb and bluegenes authored Jan 28, 2025
1 parent 37c65fd commit ba5d26d
Showing 3 changed files with 89 additions and 3 deletions.
11 changes: 8 additions & 3 deletions src/python/sourmash_plugin_directsketch/__init__.py
@@ -87,12 +87,10 @@ def __init__(self, p):
p.add_argument(
"--failed",
help="CSV of failed accessions and download links (should be mostly protein).",
required=True,
)
p.add_argument(
"--checksum-fail",
help="CSV of accessions where the md5sum check failed or the md5sum file was improperly formatted or could not be downloaded.",
required=True,
)
p.add_argument(
"-p",
@@ -177,6 +175,11 @@ def main(self, args):

num_threads = set_thread_pool(args.cores)

if args.failed is None:
args.failed = os.path.basename(args.input_csv) + '.fail.csv'
if args.checksum_fail is None:
args.checksum_fail = os.path.basename(args.input_csv) + '.checksum_fail.csv'

if args.n_simultaneous_downloads is None:
if args.api_key:
notify("API key provided - setting --n-simultaneous-downloads to 9")
@@ -260,7 +263,6 @@ def __init__(self, p):
p.add_argument(
"--failed",
help="CSV of failed accessions and download links.",
required=True,
)
# don't require checksum_fail here b/c users don't need to provide checksums
p.add_argument(
@@ -333,6 +335,9 @@ def main(self, args):

num_threads = set_thread_pool(args.cores)

if args.failed is None:
args.failed = os.path.basename(args.input_csv) + '.fail.csv'

notify(
f"Downloading and sketching all accessions in '{args.input_csv} using {args.n_simultaneous_downloads} simultaneous downloads, {args.retry_times} retries, and {num_threads} threads."
)
41 changes: 41 additions & 0 deletions tests/test_gbsketch.py
@@ -76,6 +76,47 @@ def test_gbsketch_simple(runtmp, capfd):
assert range == ""


def test_gbsketch_simple_default_failed(runtmp, capfd):
# test the default value for --failed
acc_csv = get_test_data('acc.csv')
output = runtmp.output('simple.zip')
failed = runtmp.output('acc.csv.fail.csv')
ch_fail = runtmp.output('checksum_dl_failed.csv')

sig1 = get_test_data('GCA_000175535.1.sig.gz')
sig2 = get_test_data('GCA_000961135.2.sig.gz')
sig3 = get_test_data('GCA_000961135.2.protein.sig.gz')
ss1 = sourmash.load_one_signature(sig1, ksize=31)
ss2 = sourmash.load_one_signature(sig2, ksize=31)
# note: sourmash stores protein ksizes multiplied by 3, so the k=10 protein sketch is selected with ksize=30
ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein')

runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output,
'-r', '3',
'--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200",
in_dir=runtmp.output(''))

assert os.path.exists(output)
assert not runtmp.last_result.out # stdout should be empty
captured = capfd.readouterr()
print(captured.err)
print(f"looking for path: {output}")

assert os.path.exists(failed)
with open(failed, 'r') as failF:
fail_lines = failF.readlines()
print(fail_lines)
assert len(fail_lines) == 2
assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url,range\n"
acc, name, moltype, md5sum, download_filename, url, range = fail_lines[1].strip().split(',')
assert acc == "GCA_000175535.1"
assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14"
assert moltype == "protein"
assert download_filename == "GCA_000175535.1_protein.faa.gz"
assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz"
assert range == ""


def test_gbsketch_manifest(runtmp, capfd):
acc_csv = get_test_data('acc.csv')
output = runtmp.output('simple.zip')
40 changes: 40 additions & 0 deletions tests/test_urlsketch.py
@@ -74,6 +74,46 @@ def test_urlsketch_simple(runtmp):
assert range == ""


def test_urlsketch_simple_default_failed(runtmp):
# check default value for --failed
acc_csv = get_test_data('acc-url.csv')
output = runtmp.output('simple.zip')
failed = runtmp.output('acc-url.csv.fail.csv')

sig1 = get_test_data('GCA_000175535.1.sig.gz')
sig2 = get_test_data('GCA_000961135.2.sig.gz')
sig3 = get_test_data('GCA_000961135.2.protein.sig.gz')
ss1 = sourmash.load_one_signature(sig1, ksize=31)
ss2 = sourmash.load_one_signature(sig2, ksize=31)
ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein')

runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output,
'-r', '1',
'--param-str', "dna,k=31,scaled=1000",
'-p', "protein,k=10,scaled=200",
in_dir=runtmp.output(''))

assert os.path.exists(output)
assert not runtmp.last_result.out # stdout should be empty

idx = sourmash.load_file_as_index(output)
sigs = list(idx.signatures())

assert os.path.exists(failed)
with open(failed, 'r') as failF:
header = next(failF).strip()
assert header == "accession,name,moltype,md5sum,download_filename,url,range"
for line in failF:
print(line)
acc, name, moltype, md5sum, download_filename, url, range = line.strip().split(',')
assert acc == "GCA_000175535.1"
assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14"
assert moltype == "protein"
assert download_filename == "GCA_000175535.1_protein.faa.gz"
assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz"
assert range == ""


def test_urlsketch_manifest(runtmp, capfd):
acc_csv = get_test_data('acc-url.csv')
output = runtmp.output('simple.zip')
