diff --git a/README.md b/README.md index 11791cf..0cc9cd9 100644 --- a/README.md +++ b/README.md @@ -105,11 +105,9 @@ The installation requires conda to generate a few environments. When the additional dependencies are installed you can run the workflow using `MTase-linker run` ``` -nanomotif MTase-linker run -t 10 --binsdir bins_directory --contig_bin contig_bin.tsv --bin_motifs nanomotif/bin_motifs.tsv -d ML_dependencies -o mtase_linker +nanomotif MTase-linker run -t 10 --assembly ASSEMBLY.fasta --contig_bin contig_bin.tsv --bin_motifs nanomotif/bin_motifs.tsv -d ML_dependencies -o mtase_linker ``` -Please be aware that the `--binsdir` flag should be followed by the path leading to the parent directory of the assembly og bin files, and that these files should have a .fa extension. - Running the nanomotif MTase-linker run command will generate two primary output files: mtase_assignment_table.tsv and nanomotif_assignment_table.tsv. The first file lists all predicted MTase genes in the genome along with their predicted methylation characteristics and whether the module was able to unambiguously assign any detected motifs to the MTase (`linked` = (True/False)). The second file includes data from the bin-motifs.tsv of the nanomotif output with two additional columns `linked` and `candidate_genes`. The `linked` variable is a boolean indicator if the motif could be unambiguously linked to a MTase in the bin/genome (TRUE/FALSE). If True the gene_id of the MTase is provided in `candidate_gene`. If False, the `candidate_gene` variable lists feasible candidate facilitators of the modification based on motif type and modification type predictions. diff --git a/nanomotif/_version.py b/nanomotif/_version.py index 3d26edf..df12433 100644 --- a/nanomotif/_version.py +++ b/nanomotif/_version.py @@ -1 +1 @@ -__version__ = "0.4.1" +__version__ = "0.4.2" diff --git a/nanomotif/argparser.py b/nanomotif/argparser.py index 7695fc3..9143b79 100644 --- a/nanomotif/argparser.py +++ b/nanomotif/argparser.py @@ -173,7 +173,7 @@ def create_parser(): parser_mtase_linker_run.add_argument("-t", "--threads", type=int, default=1, help="Number of threads to use. Default is 1") parser_mtase_linker_run.add_argument("--forceall", type=bool, default=False, help="Forcerun workflow regardless of already created output (default = False)") parser_mtase_linker_run.add_argument("--dryrun", type=bool, default=False, help="Dry-run the workflow. Default is False") - parser_mtase_linker_run.add_argument("--binsdir", type=str, required=True, help="Directory with bin files or assembly files. Needs to have the .fa extension.") + parser_mtase_linker_run.add_argument("--assembly", type=str, required=True, help="Path to assembly file.") parser_mtase_linker_run.add_argument("--contig_bin", type=str, required=True, help="tsv file specifying which bin contigs belong.") parser_mtase_linker_run.add_argument("--bin_motifs", type=str, required=True, help="bin-motifs.tsv output from nanomotif.") parser_mtase_linker_run.add_argument("-d", "--dependency_dir", type=str, default=os.path.join(os.getcwd(), "ML_dependencies"), help="Same as for installation. Path to directory of the ML_dependencies directory created during installation of the MTase-linker module. Default is cwd/ML_dependencies") diff --git a/nanomotif/mtase_linker/MTase_linker.smk b/nanomotif/mtase_linker/MTase_linker.smk index 70d8bfc..c761d1c 100644 --- a/nanomotif/mtase_linker/MTase_linker.smk +++ b/nanomotif/mtase_linker/MTase_linker.smk @@ -6,11 +6,13 @@ import os #configfile: "config/mtase_linker_config.yaml" # Loading unique bin names -bin_ids = glob_wildcards(os.path.join(config["BINSDIR"], "{binname}.fa")).binname +#bin_ids = glob_wildcards(os.path.join(config["BINSDIR"], "{binname}.fa")).binname # Defining output directory OUTPUTDIR = config["OUTPUTDIRECTORY"] DEPENDENCYDIR = config["DEPENDENCY_PATH"] +ASSEMBLY_PATH = config["ASSEMBLY"] +BASENAME = os.path.splitext(os.path.basename(ASSEMBLY_PATH))[0] ################################################################################# #MTase-Linker Pipeline @@ -23,17 +25,14 @@ rule all: - - - # Running prodigal to extract protein sequences rule prodigal: input: - os.path.join(config["BINSDIR"], "{binname}.fa") + ASSEMBLY_PATH output: - gene_coords = os.path.join(OUTPUTDIR, "prodigal/{binname}/{binname}.gff"), - protein_translations = os.path.join(OUTPUTDIR, "prodigal/{binname}/{binname}.faa") + gene_coords = os.path.join(OUTPUTDIR, "prodigal", f"{BASENAME}.gff"), + protein_translations = os.path.join(OUTPUTDIR, "prodigal", f"{BASENAME}.faa") threads: config["THREADS"] conda: "envs/prodigal-2.6.3.yaml" @@ -46,9 +45,9 @@ rule prodigal: # Removing asterisk from AA sequence ends in prodigal sequence file rule process_prodigal: input: - os.path.join(OUTPUTDIR, "prodigal/{binname}/{binname}.faa") + os.path.join(OUTPUTDIR, "prodigal", f"{BASENAME}.faa") output: - os.path.join(OUTPUTDIR, "prodigal/{binname}/{binname}.processed.faa") + os.path.join(OUTPUTDIR, "prodigal", f"{BASENAME}_processed.faa") shell: """ sed '/^[^>]/ s/\*$//' {input} > {output} @@ -58,14 +57,14 @@ rule process_prodigal: # Defensefinder to extract MTase genes rule defenseFinder: input: - faa =os.path.join(OUTPUTDIR, "prodigal/{binname}/{binname}.processed.faa"), + faa =os.path.join(OUTPUTDIR, "prodigal", f"{BASENAME}_processed.faa"), #dummy = os.path.join(OUTPUTDIR,"defense-finder_update.done") output: - hmmer_file = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.processed_defense_finder_hmmer.tsv"), - system_file = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.processed_defense_finder_systems.tsv") + hmmer_file = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_hmmer.tsv"), + system_file = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_systems.tsv") params: modelsdir = os.path.join(DEPENDENCYDIR, "df_models"), - outputdir = os.path.join(OUTPUTDIR, "defensefinder/{binname}") + outputdir = os.path.join(OUTPUTDIR, "defensefinder") threads: config["THREADS"] conda: "envs/defensefinder-1.2.0.yaml" @@ -78,12 +77,12 @@ rule defenseFinder: # Extracting MTase protein ids and AA sequences with defensefinder output rule extract_MTase_protein_seqs: input: - hmmer_file = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.processed_defense_finder_hmmer.tsv"), - system_file = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.processed_defense_finder_systems.tsv"), - prodigal_AAs = os.path.join(OUTPUTDIR, "prodigal/{binname}/{binname}.processed.faa") + hmmer_file = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_hmmer.tsv"), + system_file = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_systems.tsv"), + prodigal_AAs = os.path.join(OUTPUTDIR, "prodigal", f"{BASENAME}_processed.faa") output: - DF_MTase_table = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.tsv"), - DF_MTase_AA = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.faa") + DF_MTase_table = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_mtase.tsv"), + DF_MTase_AA = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_mtase.faa") conda: "envs/python_env-3.12.0.yaml" script: @@ -92,35 +91,35 @@ rule extract_MTase_protein_seqs: # Combining DefenseFinder results for all bins. -rule combine_DF_MTase_files: - input: - tsv_files = expand(os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.tsv"), binname=bin_ids), - faa_files = expand(os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.faa"), binname=bin_ids) - output: - tsv_file = os.path.join(OUTPUTDIR, "defensefinder/defense_finder_mtase_all.tsv"), - faa_file = os.path.join(OUTPUTDIR, "defensefinder/defense_finder_mtase_all.faa") - shell: - """ - # Print the header line - echo -e "hit_id\treplicon\thit_pos\thit_sequence_length\tgene_name\ti_eval\thit_score\thit_profile_cov\thit_seq_cov\thit_begin_match\thit_end_match\tRM_system" > {output.tsv_file} - # Loop through files and append their contents minus the first line - for file in {input.tsv_files}; do - tail -n +2 "$file" >> {output.tsv_file} - done - - for file in {input.faa_files}; do - tail -n +1 "$file" >> {output.faa_file} - done - """ +# rule combine_DF_MTase_files: +# input: +# tsv_files = expand(os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.tsv"), binname=bin_ids), +# faa_files = expand(os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.faa"), binname=bin_ids) +# output: +# tsv_file = os.path.join(OUTPUTDIR, "defensefinder/defense_finder_mtase_all.tsv"), +# faa_file = os.path.join(OUTPUTDIR, "defensefinder/defense_finder_mtase_all.faa") +# shell: +# """ +# # Print the header line +# echo -e "hit_id\treplicon\thit_pos\thit_sequence_length\tgene_name\ti_eval\thit_score\thit_profile_cov\thit_seq_cov\thit_begin_match\thit_end_match\tRM_system" > {output.tsv_file} +# # Loop through files and append their contents minus the first line +# for file in {input.tsv_files}; do +# tail -n +2 "$file" >> {output.tsv_file} +# done + +# for file in {input.faa_files}; do +# tail -n +1 "$file" >> {output.faa_file} +# done +# """ # Blasting MTase sequences against REbase rule blastp_REbase: input: - query = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.faa") + query = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_mtase.faa") output: - blast = temp(os.path.join(OUTPUTDIR, "blastp/{binname}/{binname}.rebase_mtase_alignment_tmp.tsv")), - blast_header = os.path.join(OUTPUTDIR, "blastp/{binname}/{binname}.rebase_mtase_alignment.tsv") + blast = temp(os.path.join(OUTPUTDIR, "blastp", f"{BASENAME}_rebase_mtase_alignment_tmp.tsv")), + blast_header = os.path.join(OUTPUTDIR, "blastp", f"{BASENAME}_rebase_mtase_alignment.tsv") params: max_target_seqs=100, db = os.path.join(DEPENDENCYDIR, "REbase_RecSeq_all", "REbase_RecSeq_all") @@ -147,9 +146,9 @@ rule blastp_REbase: # Retrieving significant BLASTP hits for motif guess rule blastp_sign_hits: input: - os.path.join(OUTPUTDIR, "blastp/{binname}/{binname}.rebase_mtase_alignment.tsv") + os.path.join(OUTPUTDIR, "blastp", f"{BASENAME}_rebase_mtase_alignment.tsv") output: - os.path.join(OUTPUTDIR, "blastp/{binname}/{binname}.rebase_mtase_sign_alignment.tsv") + os.path.join(OUTPUTDIR, "blastp", f"{BASENAME}_rebase_mtase_sign_alignment.tsv") params: identity = config["IDENTITY"], qcovs = config["QCOVS"] @@ -161,30 +160,30 @@ rule blastp_sign_hits: # Combining significant BLASTP results for all bins. -rule combine_BLASTP_files: - input: - expand(os.path.join(OUTPUTDIR, "blastp/{binname}/{binname}.rebase_mtase_sign_alignment.tsv"), binname=bin_ids), - output: - os.path.join(OUTPUTDIR, "blastp/rebase_mtase_sign_alignment_all.tsv"), - shell: - """ - # Print the header line - echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tqcovs\tsalltitles\tREbase_id\tmotif" > {output} - # Loop through files and append their contents minus the first line - for file in {input}; do - tail -n +2 "$file" >> {output} - done - """ +# rule combine_BLASTP_files: +# input: +# expand(os.path.join(OUTPUTDIR, "blastp/{binname}/{binname}.rebase_mtase_sign_alignment.tsv"), binname=bin_ids), +# output: +# os.path.join(OUTPUTDIR, "blastp/rebase_mtase_sign_alignment_all.tsv"), +# shell: +# """ +# # Print the header line +# echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tqcovs\tsalltitles\tREbase_id\tmotif" > {output} +# # Loop through files and append their contents minus the first line +# for file in {input}; do +# tail -n +2 "$file" >> {output} +# done +# """ # Mod type predictions rule run_pfam_hmm: input: - DF_MTase_AA = os.path.join(OUTPUTDIR, "defensefinder/{binname}/{binname}.defense_finder_mtase.faa") + DF_MTase_AA = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_mtase.faa") output: - hmmsearch = os.path.join(OUTPUTDIR, "pfam_hmm_hits/{binname}/{binname}.raw_hmm_hits_mtase_aaseqs.hits"), - hits = os.path.join(OUTPUTDIR, "pfam_hmm_hits/{binname}/{binname}.hmm_hits_mtase_aaseqs.tsv") + hmmsearch = os.path.join(OUTPUTDIR, "pfam_hmm_hits", f"{BASENAME}_raw_hmm_hits_mtase_aaseqs.hits"), + hits = os.path.join(OUTPUTDIR, "pfam_hmm_hits", f"{BASENAME}_hmm_hits_mtase_aaseqs.tsv") params: hmm_profiles = os.path.join(DEPENDENCYDIR, "PFAM_MTase_profiles.hmm") threads: config["THREADS"] @@ -211,9 +210,9 @@ rule run_pfam_hmm: # Converting pfam hits to mod type predictions: rule mod_type_prediction_table: input: - os.path.join(OUTPUTDIR, "pfam_hmm_hits/{binname}/{binname}.hmm_hits_mtase_aaseqs.tsv") + os.path.join(OUTPUTDIR, "pfam_hmm_hits", f"{BASENAME}_hmm_hits_mtase_aaseqs.tsv") output: - os.path.join(OUTPUTDIR, "pfam_hmm_hits/{binname}/{binname}.gene_id_mod_table.tsv") + os.path.join(OUTPUTDIR, "pfam_hmm_hits", f"{BASENAME}_gene_id_mod_table.tsv") conda: "envs/python_env-3.12.0.yaml" script: @@ -222,28 +221,28 @@ rule mod_type_prediction_table: # Combining modtype prediction for all bins. -rule combine_modtype_files: - input: - expand(os.path.join(OUTPUTDIR, "pfam_hmm_hits/{binname}/{binname}.gene_id_mod_table.tsv"), binname=bin_ids), - output: - os.path.join(OUTPUTDIR, "pfam_hmm_hits/gene_id_mod_table_all.tsv"), - shell: - """ - # Print the header line - echo -e "gene_id\tmod_type" > {output} - # Loop through files and append their contents minus the first line - for file in {input}; do - tail -n +2 "$file" >> {output} - done - """ +# rule combine_modtype_files: +# input: +# expand(os.path.join(OUTPUTDIR, "pfam_hmm_hits/{binname}/{binname}.gene_id_mod_table.tsv"), binname=bin_ids), +# output: +# os.path.join(OUTPUTDIR, "pfam_hmm_hits/gene_id_mod_table_all.tsv"), +# shell: +# """ +# # Print the header line +# echo -e "gene_id\tmod_type" > {output} +# # Loop through files and append their contents minus the first line +# for file in {input}; do +# tail -n +2 "$file" >> {output} +# done +# """ # Generating final output rule motif_assignment: input: - mod_table_path = os.path.join(OUTPUTDIR, "pfam_hmm_hits/gene_id_mod_table_all.tsv"), - DefenseFinder_path = os.path.join(OUTPUTDIR, "defensefinder/defense_finder_mtase_all.tsv"), - BLASTP_path = os.path.join(OUTPUTDIR, "blastp/rebase_mtase_sign_alignment_all.tsv"), + mod_table_path = os.path.join(OUTPUTDIR, "pfam_hmm_hits", f"{BASENAME}_gene_id_mod_table.tsv"), + DefenseFinder_path = os.path.join(OUTPUTDIR, "defensefinder", f"{BASENAME}_processed_defense_finder_mtase.tsv"), + BLASTP_path = os.path.join(OUTPUTDIR, "blastp", f"{BASENAME}_rebase_mtase_sign_alignment.tsv"), nanomotif_table_path = config["NANOMOTIF"], contig_bin = config["CONTIG_BIN"] output: diff --git a/nanomotif/mtase_linker/__init__.py b/nanomotif/mtase_linker/__init__.py index 612feb7..2127e07 100644 --- a/nanomotif/mtase_linker/__init__.py +++ b/nanomotif/mtase_linker/__init__.py @@ -1,2 +1,2 @@ _program = "MTase-linker" -__version__ = "0.1.0" \ No newline at end of file +__version__ = "0.2.1" \ No newline at end of file diff --git a/nanomotif/mtase_linker/command.py b/nanomotif/mtase_linker/command.py index 245e0b3..5cd23e0 100644 --- a/nanomotif/mtase_linker/command.py +++ b/nanomotif/mtase_linker/command.py @@ -19,22 +19,14 @@ def run_MTase_linker(args): sys.exit(1) # Bins directory - a1 = os.path.join(cwd, args.binsdir) - a2 = os.path.join(args.binsdir) - if os.path.exists(a1) and os.path.isdir(a1) and glob.glob(os.path.join(a1, '*.fa')): - binsdir = a1 - elif os.path.exists(a2) and os.path.isdir(a2) and glob.glob(os.path.join(a2, '*.fa')): - binsdir = a2 - elif os.path.exists(a1) and os.path.isdir(a1): - msg = 'Error: directory {} does not contain .fa files '.format(b1) - sys.stderr.write(msg) - sys.exit(1) - elif os.path.exists(a2) and os.path.isdir(a2): - msg = 'Error: directory {} does not contain .fa files '.format(a2) - sys.stderr.write(msg) - sys.exit(1) + a1 = os.path.join(cwd, args.assembly) + a2 = os.path.join(args.assembly) + if os.path.exists(a1): + assembly_path = a1 + elif os.path.exists(a2): + assembly_path = a2 else: - msg = 'Error: cannot find bins directory in paths {} or {} '.format(a1,a2) + msg = 'Error: cannot find assembly files in paths {} or {} '.format(a1,a2) sys.stderr.write(msg) sys.exit(1) @@ -110,7 +102,7 @@ def run_MTase_linker(args): workflow = None workflow = {"THREADS": args.threads, - "BINSDIR": binsdir, + "ASSEMBLY": assembly_path, "CONTIG_BIN": contig_bin, "OUTPUTDIRECTORY": args.outputdir, "DEPENDENCY_PATH": dependency_dir, diff --git a/nanomotif/mtase_linker/mtase_linker_config.yaml b/nanomotif/mtase_linker/mtase_linker_config.yaml deleted file mode 100644 index 2c39528..0000000 --- a/nanomotif/mtase_linker/mtase_linker_config.yaml +++ /dev/null @@ -1,23 +0,0 @@ -# Path to bin directory -BINSDIR: "/home/bio.aau.dk/wy72yf/Projects/restriction-modification-annotation/data/ZymoHMW/" #Bins directory contain.fa files - full path - -# Path to contig_bin_tsv -contig_bin_tsv: "/home/bio.aau.dk/wy72yf/Projects/restriction-modification-annotation/data/ZymoHMW/contig_bin.tsv" - -# output directory -outputdirectory: "/home/bio.aau.dk/wy72yf/Projects/test/analysis/ZymoHMW/mtase_linker_test" - -# REbase database path -REbase_db_path: "/home/bio.aau.dk/wy72yf/Projects/restriction-modification-annotation/data/REbase_RecSeq_all/REbase_RecSeq_all" - -# Sequence identity threshold for motif inference -identity: 80 - -# Query coverage threshold for motif inference -qcovs: 80 - -# PFAM hmm models path -PFAM_HMM_path: "/home/bio.aau.dk/wy72yf/Projects/restriction-modification-annotation/data/MTaseHMMProfiles/combined_MTase_profiles.hmm" - -# Nanomotif bin-motifs.tsv path -nanomotif_table: "/home/bio.aau.dk/lx38ll/dark-science/motif-identification/analysis/ZymoHMW/nanomotif/mmlong2/bin-motifs.tsv" \ No newline at end of file diff --git a/nanomotif/mtase_linker/src/mod_typing.py b/nanomotif/mtase_linker/src/mod_typing.py index fd83cc1..9a62719 100644 --- a/nanomotif/mtase_linker/src/mod_typing.py +++ b/nanomotif/mtase_linker/src/mod_typing.py @@ -4,42 +4,6 @@ import os from utillities import mod_predictions_hmm - -#%% Function to decide the mod type of the MTase protein sequences. -def select_and_filter_pfam_entries(pfam_hit_mod_df): - """ - Based on a user defined hierachy, this function decides whether the protein belong to modtype "a" or "m" or None - - Parameters: - pfam_hit_mod_df: pfam_hit_table with mod type prediction for each entry - """ - - # Define hierarchies - hierarchy_a = ["PF02086", "PF05869", "PF02384", "PF01555", "PF1216", "PF07669", "PF13651"] - hierarchy_m = ["PF0014"] - - # Function to select modtype based on hierarchy - def select_row_based_on_hierarchy(name, group, hierarchy_a, hierarchy_m): - group_a = group[group['mod_type'] == 'a'].copy() - group_m = group[group['mod_type'] == 'm'].copy() - - if not group_a.empty and not group_m.empty: - return pd.DataFrame({'gene_id': [name], 'mod_type': ['ambiguous']}) - elif not group_a.empty: - group = group_a - hierarchy = hierarchy_a - elif not group_m.empty: - group = group_m - hierarchy = hierarchy_m - else: - return group.head(1) - - group['rank'] = group['HMM_acc'].apply(lambda x: hierarchy.index(x.split('.')[0]) if x.split('.')[0] in hierarchy else len(hierarchy)) - return group.sort_values('rank').head(1) - - # Iterate over the groups in the dataframe - return pd.concat([select_row_based_on_hierarchy(name, group, hierarchy_a, hierarchy_m) for name, group in pfam_hit_mod_df.groupby('gene_id')]) - #%% Defining pfam hit table ath pfam_hit_tsv_path = snakemake.input[0] @@ -49,13 +13,31 @@ def select_row_based_on_hierarchy(name, group, hierarchy_a, hierarchy_m): pfam_hit_df = pd.read_csv(pfam_hit_tsv_path, sep = '\t', header = None) pfam_hit_df = pfam_hit_df.loc[:, 0:3] pfam_hit_df.columns = ["gene_id", "acc", "HMM_name", "HMM_acc"] - +#%% #Adding modtype prediction to pfam hits pfam_hit_mod_df = mod_predictions_hmm(pfam_hit_df, 'HMM_acc') - # Running above function to extract gene name and modtype table - protein_acc_mod_df = select_and_filter_pfam_entries(pfam_hit_mod_df)[["gene_id", "mod_type"]] + #Function to concatenate and remove duplicate letters + def process_mod_type(group): + # Concatenate all the mod types with comma separation + mod_types_concatenated = ', '.join(group['mod_type']) + + # Split the concatenated string into a set to remove duplicates, then sort it + mod_types_condensed = ', '.join(sorted(set(mod_types_concatenated.split(', ')))) + + # Check if both 'm' and 'a' are present, if so, choose 'm' only + if 'm' in mod_types_condensed and 'ac' in mod_types_condensed: + mod_types_condensed = mod_types_condensed.replace('ac, ', '').replace(', ac', '').replace('ac', '') + + # Return the processed mod types + return mod_types_condensed + + # Group by 'gene_id' and apply the function + protein_acc_mod_df = pfam_hit_mod_df.groupby('gene_id').apply(process_mod_type).reset_index() + protein_acc_mod_df.columns = ['gene_id', 'mod_type'] +#%% protein_acc_mod_df.to_csv(snakemake.output[0] , sep='\t', index=False) else: open(snakemake.output[0], 'w').close() + diff --git a/nanomotif/mtase_linker/src/motif_assignment.py b/nanomotif/mtase_linker/src/motif_assignment.py index 7a7aaf8..d2232d4 100644 --- a/nanomotif/mtase_linker/src/motif_assignment.py +++ b/nanomotif/mtase_linker/src/motif_assignment.py @@ -7,6 +7,7 @@ from utillities import reverse_complement from utillities import motif_type_predictions from utillities import RM_type_converter +from utillities import recode_mod_type #%% Importing modtype table gene_mod_tsv_path = snakemake.input['mod_table_path'] @@ -32,12 +33,14 @@ #%% Importing nanomotif table nanomotif_table_path = snakemake.input['nanomotif_table_path'] nanomotif_table = pd.read_csv(nanomotif_table_path, sep = "\t", header = 0) +nanomotif_table['assign_mod_type'] = nanomotif_table['mod_type'].apply(recode_mod_type) #Set motif acceptance threshold as >=0.5 mean_methylation and remove ambiguous motifs in nanomotif_table mean_methylation = nanomotif_table['n_mod_bin'] / (nanomotif_table['n_mod_bin'] + nanomotif_table['n_nomod_bin']) nanomotif_table_mm50 = nanomotif_table[mean_methylation >= 0.5] + #%% Importing bin_contig table from mmlong2 contig_bin_tsv_path = snakemake.input['contig_bin'] contig_bin_df = pd.read_csv(contig_bin_tsv_path , sep = "\t", header = None) @@ -60,7 +63,7 @@ #%% Add "a" mod type known for RM type I, type IIG and type III for idx, row in MTase_table_assigned.iterrows(): if row['sub_type'] == 'Type_I' or row['sub_type'] == 'Type_III' or row['sub_type'] == 'Type_IIG': - row['mod_type'] = "a" + row['mod_type'] = "ac" #%% Preparing dataframes @@ -73,15 +76,15 @@ for index, entry in nan_genes.iterrows(): # Create a copy of the current nan gene for 'a' and 'm' - new_entry_a = entry.copy() + new_entry_ac = entry.copy() new_entry_m = entry.copy() - # Set 'mod_type' to 'a' and 'm' respectively - new_entry_a['mod_type'] = 'a' + # Set 'mod_type' to 'ac' and 'm' respectively + new_entry_ac['mod_type'] = 'ac' new_entry_m['mod_type'] = 'm' # Append entries to extra_rows - extra_entries.append(new_entry_a) + extra_entries.append(new_entry_ac) extra_entries.append(new_entry_m) extra_entries_df = pd.DataFrame(extra_entries) @@ -99,15 +102,15 @@ #Group by 'bin name' and 'mod type' Cgrouped_MTase = combined_MTase_df.groupby(['bin name', 'mod_type'], dropna=True) -grouped_nanomotif = nanomotif_table_mm50.groupby(['bin', 'mod_type']) +grouped_nanomotif = nanomotif_table_mm50.groupby(['bin', 'assign_mod_type']) #%% Assigment rule system # Step 1: Priority 1 Assignment based on similarity to motif guess -for (bin_name, mod_type), nanomotif_group in grouped_nanomotif: - if (bin_name, mod_type) in Cgrouped_MTase.groups: - mtase_group = Cgrouped_MTase.get_group((bin_name, mod_type)) +for (bin_name, assign_mod_type), nanomotif_group in grouped_nanomotif: + if (bin_name, assign_mod_type) in Cgrouped_MTase.groups: + mtase_group = Cgrouped_MTase.get_group((bin_name, assign_mod_type)) for idx, row in nanomotif_group.iterrows(): # Calculate reverse complement of the motif rev_comp_motif = reverse_complement(row['motif']) @@ -142,7 +145,7 @@ #%% Re-group to exclude assigned entries -grouped_nanomotif = nanomotif_table_mm50[nanomotif_table_mm50['linked'] != True].groupby(['bin', 'mod_type']) +grouped_nanomotif = nanomotif_table_mm50[nanomotif_table_mm50['linked'] != True].groupby(['bin', 'assign_mod_type']) idx_MTase_table_assigned = MTase_table_assigned[MTase_table_assigned['linked'] != True].index #Extract indicies of non assigned entries #%% combined_MTase_df = combined_MTase_df.loc[idx_MTase_table_assigned] #FIlter combined_MTase based on extracted indicies @@ -150,9 +153,9 @@ #%% # Step 2: Priority 2 Assignment: based single subtypes in group -for (bin_name, mod_type), nanomotif_group in grouped_nanomotif: - if (bin_name, mod_type) in Cgrouped_MTase.groups: - mtase_group = Cgrouped_MTase.get_group((bin_name, mod_type)) +for (bin_name, assign_mod_type), nanomotif_group in grouped_nanomotif: + if (bin_name, assign_mod_type) in Cgrouped_MTase.groups: + mtase_group = Cgrouped_MTase.get_group((bin_name, assign_mod_type)) # Count unique motif types in nanomotif and MTase table for idx, row in nanomotif_group.iterrows(): @@ -186,15 +189,15 @@ MTase_table_assigned['detected_motif'] = MTase_table_assigned['detected_motif'].replace('nan', np.nan) #%% Re-group to exclude assigned entries -grouped_nanomotif = nanomotif_table_mm50[nanomotif_table_mm50['linked'] != True].groupby(['bin', 'mod_type']) +grouped_nanomotif = nanomotif_table_mm50[nanomotif_table_mm50['linked'] != True].groupby(['bin', 'assign_mod_type']) idx_MTase_table_assigned = MTase_table_assigned[MTase_table_assigned['linked'] != True].index #Extract indicies of non assigned entries combined_MTase_df = combined_MTase_df.loc[idx_MTase_table_assigned] #Filter combined_MTase based on extracted indicies Cgrouped_MTase = combined_MTase_df.groupby(['bin name', 'mod_type'], dropna=True) #%% Step 3: Assign candidate genes -for (bin_name, mod_type), nanomotif_group in grouped_nanomotif: - if (bin_name, mod_type) in Cgrouped_MTase.groups: - mtase_group = Cgrouped_MTase.get_group((bin_name, mod_type)) +for (bin_name, assign_mod_type), nanomotif_group in grouped_nanomotif: + if (bin_name, assign_mod_type) in Cgrouped_MTase.groups: + mtase_group = Cgrouped_MTase.get_group((bin_name, assign_mod_type)) # Count unique motif types in nanomotif and MTase table for idx, row in nanomotif_group.iterrows(): @@ -233,8 +236,11 @@ nanomotif_table_mm50.loc[idx, 'candidate_genes'] = genes_str # %% MTase_table_assigned = MTase_table_assigned[['bin name', 'gene_id', 'contig', 'mod_type', 'sub_type', 'RM_system', 'motif_type', 'REbase_ID', 'motif_guess', 'linked', 'detected_motif']] -MTase_table_assigned.columns = ['bin', 'gene_id', 'contig', 'mod_type', 'sub_type', 'RM_system', 'motif_type', 'REbase_ID', 'motif_pred', 'linked', 'detected_motif'] +MTase_table_assigned.columns = ['bin', 'gene_id', 'contig', 'mod_type_pred', 'sub_type', 'RM_system', 'motif_type_pred', 'REbase_ID', 'motif_pred', 'linked', 'detected_motif'] +MTase_table_assigned_cl = MTase_table_assigned.dropna(subset=['bin']) + +nanomotif_table_mm50 = nanomotif_table_mm50[['bin', 'mod_type', 'motif', 'mod_position', 'n_mod_bin', 'n_nomod_bin', 'motif_type', 'motif_complement', 'mod_position_complement', 'n_mod_complement', 'n_nomod_complement', 'linked', 'candidate_genes']] #%% -MTase_table_assigned.to_csv(snakemake.output['MTase_assignment_table'] , sep='\t', index=False) +MTase_table_assigned_cl.to_csv(snakemake.output['MTase_assignment_table'] , sep='\t', index=False) nanomotif_table_mm50.to_csv(snakemake.output['nanomotif_assignment_table'] , sep='\t', index=False) diff --git a/nanomotif/mtase_linker/src/utillities.py b/nanomotif/mtase_linker/src/utillities.py index d59ea82..70d601c 100644 --- a/nanomotif/mtase_linker/src/utillities.py +++ b/nanomotif/mtase_linker/src/utillities.py @@ -144,12 +144,13 @@ def mod_predictions_hmm(df, pfam_hmm_acc_column): Returns: - Return the df with the new mod type column added. """ + a_interpro_acc = ["PF01555", "PF02384", "PF12161", "PF05869", "PF02086", "PF07669", "PF13651"] m_interpro_acc = ["PF00145"] def classify_mod(x): if x.split('.')[0] in a_interpro_acc: - return "a" + return "ac" elif x.split('.')[0] in m_interpro_acc: return "m" else: @@ -159,3 +160,11 @@ def classify_mod(x): return df # %% +#Function to recode nanomotif bin-motifs modtype values. +def recode_mod_type(value): + if value == 'm': + return 'm' + elif value == 'a' or value == '21839': + return 'ac' + else: + return value \ No newline at end of file diff --git a/setup.py b/setup.py index 3efd4e4..42e5bf3 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ long_description=open('README.md').read(), long_description_content_type='text/markdown', include_package_data=True, - package_data={'nanomotif': ['datasets/*']}, + package_data={'nanomotif': ['datasets/*'], 'snakemake':['.smk']}, zip_safe=False, install_requires=[ "wheel",