Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Put all db files in .unassigner/ by default #40

Merged
merged 6 commits
Nov 29, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Default database directory
.unassigner/

# LTP refs
LTP_*.csv
LTP_*.fasta
Expand Down
10 changes: 0 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,16 +115,6 @@ Step 3: The last part of the software relies on building a database of the seque
Please see the output of `trimragged --help` for a list of the available
options.

### Count mismatches



### Percent ID ANI sample



Should there also be a command and section for prepare_strain_data?

## Contributing

We welcome ideas from our users about how to improve this
Expand Down
34 changes: 20 additions & 14 deletions unassigner/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,23 @@ def main(argv=None):
"--output_dir",
help=(
"Output directory (default: basename of query sequences FASTA "
"file, plus '_unassigned')"
"file, plus '_unassigned'. Note that it will be in the same "
"directory as the query sequences FASTA file)."
),
)
p.add_argument(
"--type_strain_fasta",
default="unassigner_species.fasta",
help=(
"Type strain sequences FASTA file (default: %(default)s). "
"If the default file is not found, sequences are downloaded "
"and re-formatted automatically."
"FASTA file containing sequences of type strains. If not provided, "
"the default database is used. Note that this WILL NOT DOWNLOAD a new db."
),
)
p.add_argument(
"--db_dir",
default=".unassigner/",
help=(
"Directory containing the reference database. If not provided, "
"the default database is used."
),
)
p.add_argument(
Expand Down Expand Up @@ -91,24 +98,23 @@ def main(argv=None):
output_dir = args.output_dir

# Download type strain files if needed
type_strain_fp_is_default = args.type_strain_fasta == p.get_default(
"type_strain_fasta"
)
type_strain_fp_is_missing = not os.path.exists(args.type_strain_fasta)
if type_strain_fp_is_default and type_strain_fp_is_missing:
download_type_strain_data()
os.makedirs(args.db_dir, exist_ok=True)
_, _, ltp_fp = download_type_strain_data(output_dir=args.db_dir)

if args.type_strain_fasta is not None:
ltp_fp = args.type_strain_fasta

with open(args.type_strain_fasta) as f:
with open(ltp_fp) as f:
species_names = dict(parse_species_names(f))

writer = OutputWriter(output_dir, species_names)

alignment_query_fp = writer.output_fp("unassigner_query.fasta")
alignment_output_fp = writer.output_fp("unassigner_query_hits.txt")
if os.path.exists(alignment_output_fp):
a = FileAligner(args.type_strain_fasta, alignment_output_fp)
a = FileAligner(ltp_fp, alignment_output_fp)
else:
a = UnassignAligner(args.type_strain_fasta)
a = UnassignAligner(ltp_fp)
a.species_input_fp = alignment_query_fp
a.species_output_fp = alignment_output_fp
a.num_cpus = args.num_cpus
Expand Down
8 changes: 6 additions & 2 deletions unassigner/prepare_strain_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,9 @@ def main(argv=None):
action="store_true",
help=("Remove all downloaded and processed files."),
)
p.add_argument("--db-dir", help=("Filepath to download the files to."))
p.add_argument(
"--db-dir", default=".autobfx/", help=("Filepath to download the files to.")
)
args = p.parse_args(argv)

if args.db_dir:
Expand Down Expand Up @@ -94,4 +96,6 @@ def download_type_strain_data(output_dir=None, metadata_fp=None, seqs_fp=None):
output_dir = os.getcwd()
metadata_fp = use_or_download(metadata_fp, LTP_METADATA_URL, output_dir)
seqs_fp = use_or_download(seqs_fp, LTP_SEQS_URL, output_dir)
return process_ltp_seqs(seqs_fp, output_dir)
ltp_fp = process_ltp_seqs(seqs_fp, output_dir)

return metadata_fp, seqs_fp, ltp_fp
Loading