PennChopMicrobiomeProgram · kylebittinger · Nov 29, 2024 · Nov 19, 2024 · Nov 19, 2024 · Nov 20, 2024
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,6 @@
+# Default database directory
+.unassigner/
+
 # LTP refs
 LTP_*.csv
 LTP_*.fasta

diff --git a/README.md b/README.md
@@ -115,16 +115,6 @@ Step 3: The last part of the software relies on building a database of the seque
 Please see the output of `trimragged --help` for a list of the available
 options.
 
-### Count mismatches
-
-
-
-### Percent ID ANI sample
-
-
-
-Should there also be a command and section for prepare_strain_data?
-
 ## Contributing
 
 We welcome ideas from our users about how to improve this

diff --git a/unassigner/command.py b/unassigner/command.py
@@ -27,16 +27,23 @@ def main(argv=None):
         "--output_dir",
         help=(
             "Output directory (default: basename of query sequences FASTA "
-            "file, plus '_unassigned')"
+            "file, plus '_unassigned'. Note that it will be in the same "
+            "directory as the query sequences FASTA file)."
         ),
     )
     p.add_argument(
         "--type_strain_fasta",
-        default="unassigner_species.fasta",
         help=(
-            "Type strain sequences FASTA file (default: %(default)s). "
-            "If the default file is not found, sequences are downloaded "
-            "and re-formatted automatically."
+            "FASTA file containing sequences of type strains. If not provided, "
+            "the default database is used. Note that this WILL NOT DOWNLOAD a new db."
+        ),
+    )
+    p.add_argument(
+        "--db_dir",
+        default=".unassigner/",
+        help=(
+            "Directory containing the default reference database "
+            "(default: %(default)s)."
         ),
     )
     p.add_argument(
@@ -91,24 +98,23 @@ def main(argv=None):
         output_dir = args.output_dir
 
     # Download type strain files if needed
-    type_strain_fp_is_default = args.type_strain_fasta == p.get_default(
-        "type_strain_fasta"
-    )
-    type_strain_fp_is_missing = not os.path.exists(args.type_strain_fasta)
-    if type_strain_fp_is_default and type_strain_fp_is_missing:
-        download_type_strain_data()
+    if args.type_strain_fasta is not None:
+        ltp_fp = args.type_strain_fasta
+    else:
+        os.makedirs(args.db_dir, exist_ok=True)
+        _, _, ltp_fp = download_type_strain_data(output_dir=args.db_dir)
 
-    with open(args.type_strain_fasta) as f:
+    with open(ltp_fp) as f:
         species_names = dict(parse_species_names(f))
 
     writer = OutputWriter(output_dir, species_names)
 
     alignment_query_fp = writer.output_fp("unassigner_query.fasta")
     alignment_output_fp = writer.output_fp("unassigner_query_hits.txt")
     if os.path.exists(alignment_output_fp):
-        a = FileAligner(args.type_strain_fasta, alignment_output_fp)
+        a = FileAligner(ltp_fp, alignment_output_fp)
     else:
-        a = UnassignAligner(args.type_strain_fasta)
+        a = UnassignAligner(ltp_fp)
         a.species_input_fp = alignment_query_fp
         a.species_output_fp = alignment_output_fp
         a.num_cpus = args.num_cpus

diff --git a/unassigner/prepare_strain_data.py b/unassigner/prepare_strain_data.py
@@ -63,7 +63,9 @@ def main(argv=None):
         action="store_true",
         help=("Remove all downloaded and processed files."),
     )
-    p.add_argument("--db-dir", help=("Filepath to download the files to."))
+    p.add_argument(
+        "--db-dir", default=".unassigner/", help=("Filepath to download the files to.")
+    )
     args = p.parse_args(argv)
 
     if args.db_dir:
@@ -94,4 +96,6 @@ def download_type_strain_data(output_dir=None, metadata_fp=None, seqs_fp=None):
         output_dir = os.getcwd()
     metadata_fp = use_or_download(metadata_fp, LTP_METADATA_URL, output_dir)
     seqs_fp = use_or_download(seqs_fp, LTP_SEQS_URL, output_dir)
-    return process_ltp_seqs(seqs_fp, output_dir)
+    ltp_fp = process_ltp_seqs(seqs_fp, output_dir)
+
+    return metadata_fp, seqs_fp, ltp_fp