Merge pull request #37 from MicrobialDarkMatter/MTaseLinker

MTase linker 4mC update
MicrobialDarkMatter · May 13, 2024 · 3325216 · 3325216
2 parents 0ee1158 + 5277708
commit 3325216
Show file tree

Hide file tree

Showing 11 changed files with 147 additions and 184 deletions.
diff --git a/README.md b/README.md
@@ -105,11 +105,9 @@ The installation requires conda to generate a few environments.
 When the additional dependencies are installed you can run the workflow using `MTase-linker run`
 
 ```
-nanomotif MTase-linker run -t 10 --binsdir bins_directory --contig_bin contig_bin.tsv --bin_motifs nanomotif/bin_motifs.tsv -d ML_dependencies -o mtase_linker
+nanomotif MTase-linker run -t 10 --assembly ASSEMBLY.fasta --contig_bin contig_bin.tsv --bin_motifs nanomotif/bin_motifs.tsv -d ML_dependencies -o mtase_linker
 ```
 
-Please be aware that the `--binsdir` flag should be followed by the path leading to the parent directory of the assembly og bin files, and that these files should have a .fa extension.
-
 Running the nanomotif MTase-linker run command will generate two primary output files: mtase_assignment_table.tsv and nanomotif_assignment_table.tsv. The first file lists all predicted MTase genes in the genome along with their predicted methylation characteristics and whether the module was able to unambiguously assign any detected motifs to the MTase (`linked` = (True/False)).
 The second file includes data from the bin-motifs.tsv of the nanomotif output with two additional columns `linked` and `candidate_genes`. The `linked` variable is a boolean indicator if the motif could be unambiguously linked to a MTase in the bin/genome (TRUE/FALSE). If True the gene_id of the MTase is provided in `candidate_gene`. If False, the `candidate_gene` variable lists feasible candidate facilitators of the modification based on motif type and modification type predictions.
 

diff --git a/nanomotif/_version.py b/nanomotif/_version.py
@@ -1 +1 @@
-__version__ = "0.4.1"
+__version__ = "0.4.2"
diff --git a/nanomotif/argparser.py b/nanomotif/argparser.py
@@ -173,7 +173,7 @@ def  create_parser():
     parser_mtase_linker_run.add_argument("-t", "--threads", type=int, default=1, help="Number of threads to use. Default is 1")
     parser_mtase_linker_run.add_argument("--forceall", type=bool, default=False, help="Forcerun workflow regardless of already created output (default = False)")
     parser_mtase_linker_run.add_argument("--dryrun", type=bool, default=False, help="Dry-run the workflow. Default is False")
-    parser_mtase_linker_run.add_argument("--binsdir", type=str, required=True, help="Directory with bin files or assembly files. Needs to have the .fa extension.")
+    parser_mtase_linker_run.add_argument("--assembly", type=str, required=True, help="Path to assembly file.")
     parser_mtase_linker_run.add_argument("--contig_bin", type=str, required=True, help="tsv file specifying which bin contigs belong.")
     parser_mtase_linker_run.add_argument("--bin_motifs", type=str, required=True, help="bin-motifs.tsv output from nanomotif.")
     parser_mtase_linker_run.add_argument("-d", "--dependency_dir", type=str, default=os.path.join(os.getcwd(), "ML_dependencies"), help="Same as for installation. Path to directory of the ML_dependencies directory created during installation of the MTase-linker module. Default is cwd/ML_dependencies")

diff --git a/nanomotif/mtase_linker/MTase_linker.smk b/nanomotif/mtase_linker/MTase_linker.smk
@@ -6,11 +6,13 @@ import os
 #configfile: "config/mtase_linker_config.yaml"
 
 # Loading unique bin names
-bin_ids = glob_wildcards(os.path.join(config["BINSDIR"], "{binname}.fa")).binname
+#bin_ids = glob_wildcards(os.path.join(config["BINSDIR"], "{binname}.fa")).binname
 
 # Defining output directory
 OUTPUTDIR = config["OUTPUTDIRECTORY"]
 DEPENDENCYDIR = config["DEPENDENCY_PATH"]
+ASSEMBLY_PATH = config["ASSEMBLY"]
+BASENAME = os.path.splitext(os.path.basename(ASSEMBLY_PATH))[0]
 
 #################################################################################
                         #MTase-Linker Pipeline
@@ -23,17 +25,14 @@ rule all:
 
 
 
-
-
-
 # Running prodigal to extract protein sequences
 
 rule prodigal:
      input:
-          os.path.join(config["BINSDIR"], "{binname}.fa")
+          ASSEMBLY_PATH
      output:
-          gene_coords = os.path.join(OUTPUTDIR, "prodigal/{binname}/{binname}.gff"),
-          protein_translations = os.path.join(OUTPUTDIR, "prodigal/{binname}/{binname}.faa")
+          gene_coords = os.path.join(OUTPUTDIR, "prodigal", f"{BASENAME}.gff"),
+          protein_translations = os.path.join(OUTPUTDIR, "prodigal", f"{BASENAME}.faa")
      threads: config["THREADS"]
      conda: 
           "envs/prodigal-2.6.3.yaml"
@@ -46,9 +45,9 @@ rule prodigal:
 # Removing asterisk from AA sequence ends in prodigal sequence file
 rule process_prodigal:
     input:
-        os.path.join(OUTPUTDIR, "prodigal/{binname}/{binname}.faa")
+        os.path.join(OUTPUTDIR, "prodigal", f"{BASENAME}.faa")
     output:
-        os.path.join(OUTPUTDIR, "prodigal/{binname}/{binname}.processed.faa")
+        os.path.join(OUTPUTDIR, "prodigal", f"{BASENAME}_processed.faa")
     shell:
         """
         sed '/^[^>]/ s/\*$//' {input} > {output}
@@ -58,14 +57,14 @@ rule process_prodigal:
 # Defensefinder to extract MTase genes
 rule defenseFinder:
     input:
-        faa =os.path.join(OUTPUTDIR, "prodigal/{binname}/{binname}.processed.faa"),
+        faa =os.path.join(OUTPUTDIR, "prodigal", f"{BASENAME}_processed.faa"),
         #dummy = os.path.join(OUTPUTDIR,"defense-finder_update.done")
     output: 
-        hmmer_file = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.processed_defense_finder_hmmer.tsv"),
-        system_file = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.processed_defense_finder_systems.tsv")
+        hmmer_file = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_hmmer.tsv"),
+        system_file = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_systems.tsv")
     params:
         modelsdir = os.path.join(DEPENDENCYDIR, "df_models"),
-        outputdir = os.path.join(OUTPUTDIR, "defensefinder/{binname}")
+        outputdir = os.path.join(OUTPUTDIR, "defensefinder")
     threads: config["THREADS"]
     conda:
         "envs/defensefinder-1.2.0.yaml"
@@ -78,12 +77,12 @@ rule defenseFinder:
 # Extracting MTase protein ids and AA sequences with defensefinder output
 rule extract_MTase_protein_seqs:
     input:
-        hmmer_file = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.processed_defense_finder_hmmer.tsv"),
-        system_file = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.processed_defense_finder_systems.tsv"),
-        prodigal_AAs = os.path.join(OUTPUTDIR, "prodigal/{binname}/{binname}.processed.faa")
+        hmmer_file = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_hmmer.tsv"),
+        system_file = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_systems.tsv"),
+        prodigal_AAs = os.path.join(OUTPUTDIR, "prodigal", f"{BASENAME}_processed.faa")
     output:
-        DF_MTase_table = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.tsv"),
-        DF_MTase_AA = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.faa")
+        DF_MTase_table = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_mtase.tsv"),
+        DF_MTase_AA = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_mtase.faa")
     conda:
         "envs/python_env-3.12.0.yaml"
     script:
@@ -92,35 +91,35 @@ rule extract_MTase_protein_seqs:
 
 
 # Combining DefenseFinder results for all bins.
-rule combine_DF_MTase_files:
-    input:
-        tsv_files = expand(os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.tsv"), binname=bin_ids),
-        faa_files = expand(os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.faa"), binname=bin_ids)
-    output:
-        tsv_file = os.path.join(OUTPUTDIR, "defensefinder/defense_finder_mtase_all.tsv"),
-        faa_file = os.path.join(OUTPUTDIR, "defensefinder/defense_finder_mtase_all.faa")
-    shell:
-        """
-        # Print the header line
-        echo -e "hit_id\treplicon\thit_pos\thit_sequence_length\tgene_name\ti_eval\thit_score\thit_profile_cov\thit_seq_cov\thit_begin_match\thit_end_match\tRM_system" > {output.tsv_file}
-        # Loop through files and append their contents minus the first line
-        for file in {input.tsv_files}; do
-            tail -n +2 "$file" >> {output.tsv_file}
-        done
-
-        for file in {input.faa_files}; do
-            tail -n +1 "$file" >> {output.faa_file}
-        done
-        """
+# rule combine_DF_MTase_files:
+#     input:
+#         tsv_files = expand(os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.tsv"), binname=bin_ids),
+#         faa_files = expand(os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.faa"), binname=bin_ids)
+#     output:
+#         tsv_file = os.path.join(OUTPUTDIR, "defensefinder/defense_finder_mtase_all.tsv"),
+#         faa_file = os.path.join(OUTPUTDIR, "defensefinder/defense_finder_mtase_all.faa")
+#     shell:
+#         """
+#         # Print the header line
+#         echo -e "hit_id\treplicon\thit_pos\thit_sequence_length\tgene_name\ti_eval\thit_score\thit_profile_cov\thit_seq_cov\thit_begin_match\thit_end_match\tRM_system" > {output.tsv_file}
+#         # Loop through files and append their contents minus the first line
+#         for file in {input.tsv_files}; do
+#             tail -n +2 "$file" >> {output.tsv_file}
+#         done
+
+#         for file in {input.faa_files}; do
+#             tail -n +1 "$file" >> {output.faa_file}
+#         done
+#         """
 
 
 # Blasting MTase sequences against REbase
 rule blastp_REbase:
     input: 
-        query = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.faa")
+        query = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_mtase.faa")
     output:
-        blast = temp(os.path.join(OUTPUTDIR, "blastp/{binname}/{binname}.rebase_mtase_alignment_tmp.tsv")),
-        blast_header = os.path.join(OUTPUTDIR, "blastp/{binname}/{binname}.rebase_mtase_alignment.tsv")
+        blast = temp(os.path.join(OUTPUTDIR, "blastp", f"{BASENAME}_rebase_mtase_alignment_tmp.tsv")),
+        blast_header = os.path.join(OUTPUTDIR, "blastp", f"{BASENAME}_rebase_mtase_alignment.tsv")
     params: 
         max_target_seqs=100,
         db = os.path.join(DEPENDENCYDIR, "REbase_RecSeq_all", "REbase_RecSeq_all")
@@ -147,9 +146,9 @@ rule blastp_REbase:
 # Retrieving significant BLASTP hits for motif guess
 rule blastp_sign_hits:
     input:
-        os.path.join(OUTPUTDIR, "blastp/{binname}/{binname}.rebase_mtase_alignment.tsv")
+        os.path.join(OUTPUTDIR, "blastp", f"{BASENAME}_rebase_mtase_alignment.tsv")
     output:
-        os.path.join(OUTPUTDIR, "blastp/{binname}/{binname}.rebase_mtase_sign_alignment.tsv")
+        os.path.join(OUTPUTDIR, "blastp", f"{BASENAME}_rebase_mtase_sign_alignment.tsv")
     params: 
         identity = config["IDENTITY"],
         qcovs = config["QCOVS"]
@@ -161,30 +160,30 @@ rule blastp_sign_hits:
 
 
 # Combining significant BLASTP results for all bins.
-rule combine_BLASTP_files:
-    input:
-        expand(os.path.join(OUTPUTDIR, "blastp/{binname}/{binname}.rebase_mtase_sign_alignment.tsv"), binname=bin_ids),
-    output:
-        os.path.join(OUTPUTDIR, "blastp/rebase_mtase_sign_alignment_all.tsv"),
-    shell:
-        """
-        # Print the header line
-        echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tqcovs\tsalltitles\tREbase_id\tmotif" > {output}
-        # Loop through files and append their contents minus the first line
-        for file in {input}; do
-            tail -n +2 "$file" >> {output}
-        done
-        """
+# rule combine_BLASTP_files:
+#     input:
+#         expand(os.path.join(OUTPUTDIR, "blastp/{binname}/{binname}.rebase_mtase_sign_alignment.tsv"), binname=bin_ids),
+#     output:
+#         os.path.join(OUTPUTDIR, "blastp/rebase_mtase_sign_alignment_all.tsv"),
+#     shell:
+#         """
+#         # Print the header line
+#         echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tqcovs\tsalltitles\tREbase_id\tmotif" > {output}
+#         # Loop through files and append their contents minus the first line
+#         for file in {input}; do
+#             tail -n +2 "$file" >> {output}
+#         done
+#         """
 
 
 
 # Mod type predictions
 rule run_pfam_hmm:
     input:
-        DF_MTase_AA = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.faa")
+        DF_MTase_AA = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_mtase.faa")
     output:
-        hmmsearch = os.path.join(OUTPUTDIR, "pfam_hmm_hits/{binname}/{binname}.raw_hmm_hits_mtase_aaseqs.hits"),
-        hits = os.path.join(OUTPUTDIR, "pfam_hmm_hits/{binname}/{binname}.hmm_hits_mtase_aaseqs.tsv")
+        hmmsearch = os.path.join(OUTPUTDIR, "pfam_hmm_hits", f"{BASENAME}_raw_hmm_hits_mtase_aaseqs.hits"),
+        hits = os.path.join(OUTPUTDIR, "pfam_hmm_hits", f"{BASENAME}_hmm_hits_mtase_aaseqs.tsv")
     params: 
         hmm_profiles = os.path.join(DEPENDENCYDIR, "PFAM_MTase_profiles.hmm")
     threads: config["THREADS"]
@@ -211,9 +210,9 @@ rule run_pfam_hmm:
 # Converting pfam hits to mod type predictions:
 rule mod_type_prediction_table:
     input:
-        os.path.join(OUTPUTDIR, "pfam_hmm_hits/{binname}/{binname}.hmm_hits_mtase_aaseqs.tsv")
+        os.path.join(OUTPUTDIR, "pfam_hmm_hits", f"{BASENAME}_hmm_hits_mtase_aaseqs.tsv")
     output:
-        os.path.join(OUTPUTDIR, "pfam_hmm_hits/{binname}/{binname}.gene_id_mod_table.tsv")
+        os.path.join(OUTPUTDIR, "pfam_hmm_hits", f"{BASENAME}_gene_id_mod_table.tsv")
     conda:
         "envs/python_env-3.12.0.yaml"
     script:
@@ -222,28 +221,28 @@ rule mod_type_prediction_table:
 
 
 # Combining modtype prediction for all bins.
-rule combine_modtype_files:
-    input:
-        expand(os.path.join(OUTPUTDIR, "pfam_hmm_hits/{binname}/{binname}.gene_id_mod_table.tsv"), binname=bin_ids),
-    output:
-        os.path.join(OUTPUTDIR, "pfam_hmm_hits/gene_id_mod_table_all.tsv"),
-    shell:
-        """
-        # Print the header line
-        echo -e "gene_id\tmod_type" > {output}
-        # Loop through files and append their contents minus the first line
-        for file in {input}; do
-            tail -n +2 "$file" >> {output}
-        done
-        """
+# rule combine_modtype_files:
+#     input:
+#         expand(os.path.join(OUTPUTDIR, "pfam_hmm_hits/{binname}/{binname}.gene_id_mod_table.tsv"), binname=bin_ids),
+#     output:
+#         os.path.join(OUTPUTDIR, "pfam_hmm_hits/gene_id_mod_table_all.tsv"),
+#     shell:
+#         """
+#         # Print the header line
+#         echo -e "gene_id\tmod_type" > {output}
+#         # Loop through files and append their contents minus the first line
+#         for file in {input}; do
+#             tail -n +2 "$file" >> {output}
+#         done
+#         """
 
 
 # Generating final output
 rule motif_assignment:
     input:
-        mod_table_path = os.path.join(OUTPUTDIR, "pfam_hmm_hits/gene_id_mod_table_all.tsv"),
-        DefenseFinder_path = os.path.join(OUTPUTDIR, "defensefinder/defense_finder_mtase_all.tsv"),
-        BLASTP_path = os.path.join(OUTPUTDIR, "blastp/rebase_mtase_sign_alignment_all.tsv"),
+        mod_table_path = os.path.join(OUTPUTDIR, "pfam_hmm_hits", f"{BASENAME}_gene_id_mod_table.tsv"),
+        DefenseFinder_path = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_mtase.tsv"),
+        BLASTP_path = os.path.join(OUTPUTDIR, "blastp", f"{BASENAME}_rebase_mtase_sign_alignment.tsv"),
         nanomotif_table_path = config["NANOMOTIF"],
         contig_bin = config["CONTIG_BIN"]
     output:

diff --git a/nanomotif/mtase_linker/__init__.py b/nanomotif/mtase_linker/__init__.py
@@ -1,2 +1,2 @@
 _program = "MTase-linker"
-__version__ = "0.1.0"
+__version__ = "0.2.1"
diff --git a/nanomotif/mtase_linker/command.py b/nanomotif/mtase_linker/command.py
@@ -19,22 +19,14 @@ def run_MTase_linker(args):
         sys.exit(1)
 
     # Bins directory
-    a1 = os.path.join(cwd, args.binsdir)
-    a2 = os.path.join(args.binsdir)
-    if os.path.exists(a1) and os.path.isdir(a1) and glob.glob(os.path.join(a1, '*.fa')):
-        binsdir = a1
-    elif os.path.exists(a2) and os.path.isdir(a2) and glob.glob(os.path.join(a2, '*.fa')):
-        binsdir = a2
-    elif os.path.exists(a1) and os.path.isdir(a1):
-        msg = 'Error: directory {} does not contain .fa files '.format(b1)
-        sys.stderr.write(msg)
-        sys.exit(1)
-    elif os.path.exists(a2) and os.path.isdir(a2):
-        msg = 'Error: directory {} does not contain .fa files '.format(a2)
-        sys.stderr.write(msg)
-        sys.exit(1)
+    a1 = os.path.join(cwd, args.assembly)
+    a2 = os.path.join(args.assembly)
+    if os.path.exists(a1):
+        assembly_path = a1
+    elif os.path.exists(a2):
+        assembly_path = a2
     else:
-        msg = 'Error: cannot find bins directory in paths {} or {} '.format(a1,a2)
+        msg = 'Error: cannot find assembly files in paths {} or {} '.format(a1,a2)
         sys.stderr.write(msg)
         sys.exit(1)
 
@@ -110,7 +102,7 @@ def run_MTase_linker(args):
 
     workflow = None
     workflow = {"THREADS": args.threads,
-                "BINSDIR": binsdir,
+                "ASSEMBLY": assembly_path,
                 "CONTIG_BIN": contig_bin,
                 "OUTPUTDIRECTORY": args.outputdir,
                 "DEPENDENCY_PATH": dependency_dir,

diff --git a/nanomotif/mtase_linker/mtase_linker_config.yaml b/nanomotif/mtase_linker/mtase_linker_config.yaml