Skip to content

Commit

Permalink
Merge pull request #37 from MicrobialDarkMatter/MTaseLinker
Browse files Browse the repository at this point in the history
MTase linker 4mC update
  • Loading branch information
SorenHeidelbach authored May 13, 2024
2 parents 0ee1158 + 5277708 commit 3325216
Show file tree
Hide file tree
Showing 11 changed files with 147 additions and 184 deletions.
4 changes: 1 addition & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,9 @@ The installation requires conda to generate a few environments.
When the additional dependencies are installed you can run the workflow using `MTase-linker run`

```
nanomotif MTase-linker run -t 10 --binsdir bins_directory --contig_bin contig_bin.tsv --bin_motifs nanomotif/bin_motifs.tsv -d ML_dependencies -o mtase_linker
nanomotif MTase-linker run -t 10 --assembly ASSEMBLY.fasta --contig_bin contig_bin.tsv --bin_motifs nanomotif/bin_motifs.tsv -d ML_dependencies -o mtase_linker
```

Please be aware that the `--binsdir` flag should be followed by the path leading to the parent directory of the assembly og bin files, and that these files should have a .fa extension.

Running the nanomotif MTase-linker run command will generate two primary output files: mtase_assignment_table.tsv and nanomotif_assignment_table.tsv. The first file lists all predicted MTase genes in the genome along with their predicted methylation characteristics and whether the module was able to unambiguously assign any detected motifs to the MTase (`linked` = (True/False)).
The second file includes data from the bin-motifs.tsv of the nanomotif output with two additional columns `linked` and `candidate_genes`. The `linked` variable is a boolean indicator if the motif could be unambiguously linked to a MTase in the bin/genome (TRUE/FALSE). If True the gene_id of the MTase is provided in `candidate_gene`. If False, the `candidate_gene` variable lists feasible candidate facilitators of the modification based on motif type and modification type predictions.

Expand Down
2 changes: 1 addition & 1 deletion nanomotif/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.4.1"
__version__ = "0.4.2"
2 changes: 1 addition & 1 deletion nanomotif/argparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def create_parser():
parser_mtase_linker_run.add_argument("-t", "--threads", type=int, default=1, help="Number of threads to use. Default is 1")
parser_mtase_linker_run.add_argument("--forceall", type=bool, default=False, help="Forcerun workflow regardless of already created output (default = False)")
parser_mtase_linker_run.add_argument("--dryrun", type=bool, default=False, help="Dry-run the workflow. Default is False")
parser_mtase_linker_run.add_argument("--binsdir", type=str, required=True, help="Directory with bin files or assembly files. Needs to have the .fa extension.")
parser_mtase_linker_run.add_argument("--assembly", type=str, required=True, help="Path to assembly file.")
parser_mtase_linker_run.add_argument("--contig_bin", type=str, required=True, help="tsv file specifying which bin contigs belong.")
parser_mtase_linker_run.add_argument("--bin_motifs", type=str, required=True, help="bin-motifs.tsv output from nanomotif.")
parser_mtase_linker_run.add_argument("-d", "--dependency_dir", type=str, default=os.path.join(os.getcwd(), "ML_dependencies"), help="Same as for installation. Path to directory of the ML_dependencies directory created during installation of the MTase-linker module. Default is cwd/ML_dependencies")
Expand Down
157 changes: 78 additions & 79 deletions nanomotif/mtase_linker/MTase_linker.smk
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@ import os
#configfile: "config/mtase_linker_config.yaml"

# Loading unique bin names
bin_ids = glob_wildcards(os.path.join(config["BINSDIR"], "{binname}.fa")).binname
#bin_ids = glob_wildcards(os.path.join(config["BINSDIR"], "{binname}.fa")).binname

# Defining output directory
OUTPUTDIR = config["OUTPUTDIRECTORY"]
DEPENDENCYDIR = config["DEPENDENCY_PATH"]
ASSEMBLY_PATH = config["ASSEMBLY"]
BASENAME = os.path.splitext(os.path.basename(ASSEMBLY_PATH))[0]

#################################################################################
#MTase-Linker Pipeline
Expand All @@ -23,17 +25,14 @@ rule all:






# Running prodigal to extract protein sequences

rule prodigal:
input:
os.path.join(config["BINSDIR"], "{binname}.fa")
ASSEMBLY_PATH
output:
gene_coords = os.path.join(OUTPUTDIR, "prodigal/{binname}/{binname}.gff"),
protein_translations = os.path.join(OUTPUTDIR, "prodigal/{binname}/{binname}.faa")
gene_coords = os.path.join(OUTPUTDIR, "prodigal", f"{BASENAME}.gff"),
protein_translations = os.path.join(OUTPUTDIR, "prodigal", f"{BASENAME}.faa")
threads: config["THREADS"]
conda:
"envs/prodigal-2.6.3.yaml"
Expand All @@ -46,9 +45,9 @@ rule prodigal:
# Removing asterisk from AA sequence ends in prodigal sequence file
rule process_prodigal:
input:
os.path.join(OUTPUTDIR, "prodigal/{binname}/{binname}.faa")
os.path.join(OUTPUTDIR, "prodigal", f"{BASENAME}.faa")
output:
os.path.join(OUTPUTDIR, "prodigal/{binname}/{binname}.processed.faa")
os.path.join(OUTPUTDIR, "prodigal", f"{BASENAME}_processed.faa")
shell:
"""
sed '/^[^>]/ s/\*$//' {input} > {output}
Expand All @@ -58,14 +57,14 @@ rule process_prodigal:
# Defensefinder to extract MTase genes
rule defenseFinder:
input:
faa =os.path.join(OUTPUTDIR, "prodigal/{binname}/{binname}.processed.faa"),
faa =os.path.join(OUTPUTDIR, "prodigal", f"{BASENAME}_processed.faa"),
#dummy = os.path.join(OUTPUTDIR,"defense-finder_update.done")
output:
hmmer_file = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.processed_defense_finder_hmmer.tsv"),
system_file = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.processed_defense_finder_systems.tsv")
hmmer_file = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_hmmer.tsv"),
system_file = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_systems.tsv")
params:
modelsdir = os.path.join(DEPENDENCYDIR, "df_models"),
outputdir = os.path.join(OUTPUTDIR, "defensefinder/{binname}")
outputdir = os.path.join(OUTPUTDIR, "defensefinder")
threads: config["THREADS"]
conda:
"envs/defensefinder-1.2.0.yaml"
Expand All @@ -78,12 +77,12 @@ rule defenseFinder:
# Extracting MTase protein ids and AA sequences with defensefinder output
rule extract_MTase_protein_seqs:
input:
hmmer_file = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.processed_defense_finder_hmmer.tsv"),
system_file = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.processed_defense_finder_systems.tsv"),
prodigal_AAs = os.path.join(OUTPUTDIR, "prodigal/{binname}/{binname}.processed.faa")
hmmer_file = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_hmmer.tsv"),
system_file = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_systems.tsv"),
prodigal_AAs = os.path.join(OUTPUTDIR, "prodigal", f"{BASENAME}_processed.faa")
output:
DF_MTase_table = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.tsv"),
DF_MTase_AA = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.faa")
DF_MTase_table = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_mtase.tsv"),
DF_MTase_AA = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_mtase.faa")
conda:
"envs/python_env-3.12.0.yaml"
script:
Expand All @@ -92,35 +91,35 @@ rule extract_MTase_protein_seqs:


# Combining DefenseFinder results for all bins.
rule combine_DF_MTase_files:
input:
tsv_files = expand(os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.tsv"), binname=bin_ids),
faa_files = expand(os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.faa"), binname=bin_ids)
output:
tsv_file = os.path.join(OUTPUTDIR, "defensefinder/defense_finder_mtase_all.tsv"),
faa_file = os.path.join(OUTPUTDIR, "defensefinder/defense_finder_mtase_all.faa")
shell:
"""
# Print the header line
echo -e "hit_id\treplicon\thit_pos\thit_sequence_length\tgene_name\ti_eval\thit_score\thit_profile_cov\thit_seq_cov\thit_begin_match\thit_end_match\tRM_system" > {output.tsv_file}
# Loop through files and append their contents minus the first line
for file in {input.tsv_files}; do
tail -n +2 "$file" >> {output.tsv_file}
done
for file in {input.faa_files}; do
tail -n +1 "$file" >> {output.faa_file}
done
"""
# rule combine_DF_MTase_files:
# input:
# tsv_files = expand(os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.tsv"), binname=bin_ids),
# faa_files = expand(os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.faa"), binname=bin_ids)
# output:
# tsv_file = os.path.join(OUTPUTDIR, "defensefinder/defense_finder_mtase_all.tsv"),
# faa_file = os.path.join(OUTPUTDIR, "defensefinder/defense_finder_mtase_all.faa")
# shell:
# """
# # Print the header line
# echo -e "hit_id\treplicon\thit_pos\thit_sequence_length\tgene_name\ti_eval\thit_score\thit_profile_cov\thit_seq_cov\thit_begin_match\thit_end_match\tRM_system" > {output.tsv_file}
# # Loop through files and append their contents minus the first line
# for file in {input.tsv_files}; do
# tail -n +2 "$file" >> {output.tsv_file}
# done

# for file in {input.faa_files}; do
# tail -n +1 "$file" >> {output.faa_file}
# done
# """


# Blasting MTase sequences against REbase
rule blastp_REbase:
input:
query = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.faa")
query = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_mtase.faa")
output:
blast = temp(os.path.join(OUTPUTDIR, "blastp/{binname}/{binname}.rebase_mtase_alignment_tmp.tsv")),
blast_header = os.path.join(OUTPUTDIR, "blastp/{binname}/{binname}.rebase_mtase_alignment.tsv")
blast = temp(os.path.join(OUTPUTDIR, "blastp", f"{BASENAME}_rebase_mtase_alignment_tmp.tsv")),
blast_header = os.path.join(OUTPUTDIR, "blastp", f"{BASENAME}_rebase_mtase_alignment.tsv")
params:
max_target_seqs=100,
db = os.path.join(DEPENDENCYDIR, "REbase_RecSeq_all", "REbase_RecSeq_all")
Expand All @@ -147,9 +146,9 @@ rule blastp_REbase:
# Retrieving significant BLASTP hits for motif guess
rule blastp_sign_hits:
input:
os.path.join(OUTPUTDIR, "blastp/{binname}/{binname}.rebase_mtase_alignment.tsv")
os.path.join(OUTPUTDIR, "blastp", f"{BASENAME}_rebase_mtase_alignment.tsv")
output:
os.path.join(OUTPUTDIR, "blastp/{binname}/{binname}.rebase_mtase_sign_alignment.tsv")
os.path.join(OUTPUTDIR, "blastp", f"{BASENAME}_rebase_mtase_sign_alignment.tsv")
params:
identity = config["IDENTITY"],
qcovs = config["QCOVS"]
Expand All @@ -161,30 +160,30 @@ rule blastp_sign_hits:


# Combining significant BLASTP results for all bins.
rule combine_BLASTP_files:
input:
expand(os.path.join(OUTPUTDIR, "blastp/{binname}/{binname}.rebase_mtase_sign_alignment.tsv"), binname=bin_ids),
output:
os.path.join(OUTPUTDIR, "blastp/rebase_mtase_sign_alignment_all.tsv"),
shell:
"""
# Print the header line
echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tqcovs\tsalltitles\tREbase_id\tmotif" > {output}
# Loop through files and append their contents minus the first line
for file in {input}; do
tail -n +2 "$file" >> {output}
done
"""
# rule combine_BLASTP_files:
# input:
# expand(os.path.join(OUTPUTDIR, "blastp/{binname}/{binname}.rebase_mtase_sign_alignment.tsv"), binname=bin_ids),
# output:
# os.path.join(OUTPUTDIR, "blastp/rebase_mtase_sign_alignment_all.tsv"),
# shell:
# """
# # Print the header line
# echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tqcovs\tsalltitles\tREbase_id\tmotif" > {output}
# # Loop through files and append their contents minus the first line
# for file in {input}; do
# tail -n +2 "$file" >> {output}
# done
# """



# Mod type predictions
rule run_pfam_hmm:
input:
DF_MTase_AA = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.faa")
DF_MTase_AA = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_mtase.faa")
output:
hmmsearch = os.path.join(OUTPUTDIR, "pfam_hmm_hits/{binname}/{binname}.raw_hmm_hits_mtase_aaseqs.hits"),
hits = os.path.join(OUTPUTDIR, "pfam_hmm_hits/{binname}/{binname}.hmm_hits_mtase_aaseqs.tsv")
hmmsearch = os.path.join(OUTPUTDIR, "pfam_hmm_hits", f"{BASENAME}_raw_hmm_hits_mtase_aaseqs.hits"),
hits = os.path.join(OUTPUTDIR, "pfam_hmm_hits", f"{BASENAME}_hmm_hits_mtase_aaseqs.tsv")
params:
hmm_profiles = os.path.join(DEPENDENCYDIR, "PFAM_MTase_profiles.hmm")
threads: config["THREADS"]
Expand All @@ -211,9 +210,9 @@ rule run_pfam_hmm:
# Converting pfam hits to mod type predictions:
rule mod_type_prediction_table:
input:
os.path.join(OUTPUTDIR, "pfam_hmm_hits/{binname}/{binname}.hmm_hits_mtase_aaseqs.tsv")
os.path.join(OUTPUTDIR, "pfam_hmm_hits", f"{BASENAME}_hmm_hits_mtase_aaseqs.tsv")
output:
os.path.join(OUTPUTDIR, "pfam_hmm_hits/{binname}/{binname}.gene_id_mod_table.tsv")
os.path.join(OUTPUTDIR, "pfam_hmm_hits", f"{BASENAME}_gene_id_mod_table.tsv")
conda:
"envs/python_env-3.12.0.yaml"
script:
Expand All @@ -222,28 +221,28 @@ rule mod_type_prediction_table:


# Combining modtype prediction for all bins.
rule combine_modtype_files:
input:
expand(os.path.join(OUTPUTDIR, "pfam_hmm_hits/{binname}/{binname}.gene_id_mod_table.tsv"), binname=bin_ids),
output:
os.path.join(OUTPUTDIR, "pfam_hmm_hits/gene_id_mod_table_all.tsv"),
shell:
"""
# Print the header line
echo -e "gene_id\tmod_type" > {output}
# Loop through files and append their contents minus the first line
for file in {input}; do
tail -n +2 "$file" >> {output}
done
"""
# rule combine_modtype_files:
# input:
# expand(os.path.join(OUTPUTDIR, "pfam_hmm_hits/{binname}/{binname}.gene_id_mod_table.tsv"), binname=bin_ids),
# output:
# os.path.join(OUTPUTDIR, "pfam_hmm_hits/gene_id_mod_table_all.tsv"),
# shell:
# """
# # Print the header line
# echo -e "gene_id\tmod_type" > {output}
# # Loop through files and append their contents minus the first line
# for file in {input}; do
# tail -n +2 "$file" >> {output}
# done
# """


# Generating final output
rule motif_assignment:
input:
mod_table_path = os.path.join(OUTPUTDIR, "pfam_hmm_hits/gene_id_mod_table_all.tsv"),
DefenseFinder_path = os.path.join(OUTPUTDIR, "defensefinder/defense_finder_mtase_all.tsv"),
BLASTP_path = os.path.join(OUTPUTDIR, "blastp/rebase_mtase_sign_alignment_all.tsv"),
mod_table_path = os.path.join(OUTPUTDIR, "pfam_hmm_hits", f"{BASENAME}_gene_id_mod_table.tsv"),
DefenseFinder_path = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_mtase.tsv"),
BLASTP_path = os.path.join(OUTPUTDIR, "blastp", f"{BASENAME}_rebase_mtase_sign_alignment.tsv"),
nanomotif_table_path = config["NANOMOTIF"],
contig_bin = config["CONTIG_BIN"]
output:
Expand Down
2 changes: 1 addition & 1 deletion nanomotif/mtase_linker/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
_program = "MTase-linker"
__version__ = "0.1.0"
__version__ = "0.2.1"
24 changes: 8 additions & 16 deletions nanomotif/mtase_linker/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,22 +19,14 @@ def run_MTase_linker(args):
sys.exit(1)

# Bins directory
a1 = os.path.join(cwd, args.binsdir)
a2 = os.path.join(args.binsdir)
if os.path.exists(a1) and os.path.isdir(a1) and glob.glob(os.path.join(a1, '*.fa')):
binsdir = a1
elif os.path.exists(a2) and os.path.isdir(a2) and glob.glob(os.path.join(a2, '*.fa')):
binsdir = a2
elif os.path.exists(a1) and os.path.isdir(a1):
msg = 'Error: directory {} does not contain .fa files '.format(b1)
sys.stderr.write(msg)
sys.exit(1)
elif os.path.exists(a2) and os.path.isdir(a2):
msg = 'Error: directory {} does not contain .fa files '.format(a2)
sys.stderr.write(msg)
sys.exit(1)
a1 = os.path.join(cwd, args.assembly)
a2 = os.path.join(args.assembly)
if os.path.exists(a1):
assembly_path = a1
elif os.path.exists(a2):
assembly_path = a2
else:
msg = 'Error: cannot find bins directory in paths {} or {} '.format(a1,a2)
msg = 'Error: cannot find assembly files in paths {} or {} '.format(a1,a2)
sys.stderr.write(msg)
sys.exit(1)

Expand Down Expand Up @@ -110,7 +102,7 @@ def run_MTase_linker(args):

workflow = None
workflow = {"THREADS": args.threads,
"BINSDIR": binsdir,
"ASSEMBLY": assembly_path,
"CONTIG_BIN": contig_bin,
"OUTPUTDIRECTORY": args.outputdir,
"DEPENDENCY_PATH": dependency_dir,
Expand Down
23 changes: 0 additions & 23 deletions nanomotif/mtase_linker/mtase_linker_config.yaml

This file was deleted.

Loading

0 comments on commit 3325216

Please sign in to comment.