bioenvada main

Bio2Byte · Apr 19, 2024 · 69a3201 · 69a3201
1 parent 6bc7ec3
commit 69a3201
Show file tree

Hide file tree

Showing 29 changed files with 4,600 additions and 0 deletions.
diff --git a/BioEnvAda_poster.pdf b/BioEnvAda_poster.pdf
diff --git a/BioEnvAda_scheme.png b/BioEnvAda_scheme.png
diff --git a/README.md b/README.md
@@ -0,0 +1,96 @@
+# BioEnvAda
+
+The analysis of protein evolution requires many steps and tools, starting from collecting DNA data to predicting protein structure. 
+We developed a NextFlow (BioEnvAda) pipeline to investigate protein adaptation to changing environmental conditions. It considers multiple aspects of protein evolution comparing changes in amino acid sequences while considering both phylogenetic information and measures of evolutionary pressure. It calculates tendencies for specific biophysical behaviours accounting for the local sequence environments and incorporates predicted 3D structures of a protein.
+
+
+
+## Quickstart 
+
+The default for all parameters in BioEnvAda is false. If you want to use a predictor, add the flag to the command line to turn it on.
+
+
+Usage in commad line:
+
+ nextflow run pipeline.nf \
+    -profile standard,withdocker \
+    --targetSequences ../input_example.fasta \
+    --type 'aa' or 'nuc' \
+    --qc \ 
+    --clustering 0.85\
+    --relabel \
+    --alignSequences \
+    --efoldmine \
+    --disomine \
+    --agmata \
+    --fetchStructures \
+    --buildTreeEvo \   
+    --outGroup 'Species name to root your tree on' \
+    --csubst \
+    --branchIds '1,2,3'\
+    --eteEvol 'M7,M8' \
+    --selectedProteins 'your,proteins,as,str' \
+    --plotBiophysicalFeatures \
+    --buildLogo \
+    --plotTree \
+	-resume
+
+Alternatively, adapt launch file run_nf.sh
+
+The launch file also provides an extensive log file with the execution hash (e.g. 29f47dd0-59d1-4ca5-a602-00e10b693b31)to resume past jobs.
+Add -resume to restart the last job or -resume execution_hash to restart any older job.
+This can be used to restart jobs that crashed, but also to create plots with different highlighted proteins or different selected branches for csubst, without the need to recalculate all other steps.
+
+
+
+
+![Demonstration of the BioEnvAda workflow and results!](BioEnvAda_scheme.png "Demonstration of the BioEnvAda workflow and results")
+
+## List of parameters
+
+### INPUT FILES
+
+- Input file: 		--targetSequences path/to/data/file
+- Input file type: 	--type  nuc  
+    - NOTE: For input of amino acid sequences use 'aa'
+
+### FILTERING
+
+- Set minimal ooccupancy of position in MSA: 		--qc 
+    - NOTE:
+    - --qc to remove empty columns in alignment
+	- --qc 0.85 to set minial occupancy                        
+- Clustering with CD-Hit: 						--clustering 1
+	- NOTE: 
+    - --clustering to remove duplicate sequences
+	- --cluster 0.85 to set similarity cutoff 
+- Adapt labels to clustering:						--relabel
+
+### PREDICTORS
+
+- Align sequences --alignSequences true
+	- NOTE: 	
+    - –-type aa: residue based MSA with Clustal
+	- --type nuc: nucleotide based MSA with MACSE 
+	- remove flag to keep pre-aligned file 
+
+- DynaMine : ALWAYS
+- DisoMine :					--disomine
+- EFoldMine :					--efoldmine
+- AgMata :					--agmata
+
+Fetch structures using ESM Atlas (--fetchStructures): false
+
+- Phylo. Tree :				--buildTreeEvo
+- Species name to root your tree on :	--outGroup partialSpeciesID
+- Csubst :					--csubst
+- CsubstSite :				--branchIds 1,5
+- EteEvol :					--eteEvol M7,M8
+
+### OUTPUT FILES
+
+
+- Proteins to be highlighted in the plots: --selectedProteins AncNode14,Syn_BIOS_U3
+- Plot B2btools :							 --plotBiophysicalFeatures  
+- Logo :									 --buildLogo
+- Phylo. Tree plot :						 --plotTree
diff --git a/bin/EteEvol.py b/bin/EteEvol.py
@@ -0,0 +1,69 @@
+#!/usr/bin/python3
+
+import os
+import sys
+from ete3 import EvolTree #, Tree, TreeStyle, TextFace
+from ete3.treeview.layouts import evol_clean_layout
+
+os.mkdir("pamlwd")
+os.mkdir("plots")
+
+name = sys.argv[1]
+tree_file_path = sys.argv[2]
+alignment_file_path = sys.argv[3]
+models=sys.argv[4]
+
+#evol_models = ('M0',"M1","M2") #,'fb' runs forever??
+
+evol_models = models.split(',')
+
+def mylayout(node):
+        if node.is_leaf():
+                node.img_style["size"] = 8
+                node.img_style["shape"] = "circle"
+
+
+def tree_plot(tree, model):
+        #ts = TreeStyle()
+
+        # ts.scale = 100
+        # ts.title.add_face(TextFace(name, fsize=20), column=0)
+        # ts.layout_fn = mylayout
+
+        modname = model.replace(".", "_")
+        hist=model.split(".",1)
+
+        image_name = modname+"_dnds.pdf"
+        plot_filename = os.path.join("plots", image_name)
+
+        #tree.render(plot_filename, w=18000, tree_style=ts, layout=evol_clean_layout, histfaces=[model])
+        if hist[0]=="fb":
+            tree.render(plot_filename, layout=evol_clean_layout)
+        elif hist[0]=="M0":
+            tree.render(plot_filename, layout=evol_clean_layout)
+        else:
+            # tree.render(plot_filename,tree_style=ts, layout=evol_clean_layout)
+            tree.render(plot_filename,layout=evol_clean_layout, histfaces=[model])
+        print("Plot EXECUTED WITH SUCCESS: "+ plot_filename)
+
+
+def dnds_ete3(tree_file_path, alignment_file_path, mod):
+        tree = EvolTree(tree_file_path, format=1)
+        tree.link_to_alignment(alignment_file_path)
+        print('alignment linked')
+
+        tree.workdir =  'pamlwd'
+
+        #run model
+        tree.run_model(mod)
+        print("model " +mod+ " done")
+
+        #wdir=output_directory+'/'+mod+'/out'
+        #tree.link_to_evol_model(wdir,evmodel)
+
+        tree_plot(tree, mod)
+
+
+for evol_model in evol_models:
+    model_name = evol_model+'.'+name
+    dnds_ete3(tree_file_path, alignment_file_path, model_name)
diff --git a/bin/MsaChecker.py b/bin/MsaChecker.py
@@ -0,0 +1,114 @@
+#!/usr/local/bin/python
+import pandas as pd
+import sys
+
+msa = sys.argv[1] #'work/4a/2282f5919fc18184b19af59b29334a/CK_00001561_null_filtered.fasta.anuc'  #'bact_DA_SP_MSA_clustal.fasta' #sys.argv[1]
+buildTreeEvo =  sys.argv[2] #True# sys.argv[2]
+drop_empty = sys.argv[3]
+
+made_new_file = False
+
+with open(msa, 'r') as f:
+    lines = f.readlines()
+
+sequences_dict = {}
+name = 'a'
+sequence = 'a'
+for line in lines:
+    if line.startswith('>'):
+        sequences_dict[name] = sequence
+        name = line.strip('>\n')
+        sequence = ''
+    else:
+        sequence += line.strip()
+
+
+sequences_dict[name] = sequence
+sequences_dict.pop('a')
+print (sequences_dict)
+#alignment_file = pd.read_csv(msa, header=None)
+#seq_df = pd.DataFrame(alignment_file.iloc[1::2].values , columns=['seq'])
+
+msa_df = pd.DataFrame.from_dict(sequences_dict, orient='index',  columns=['seq'])
+
+seq_df = pd.DataFrame(msa_df.seq.apply(list).tolist())
+
+
+#check if seqs have all the same length
+if seq_df.isnull().values.any() == True:
+    raise ValueError("Sequences do not have all the same length, is this an MSA?")
+
+
+#remove columns without minimal occupancy 
+SEQUENCES_COUNT, RESIDUES_COUNT = seq_df.shape
+OCCUPANCY = 1 - (seq_df == '-').sum() / SEQUENCES_COUNT
+
+only_gap = []
+cols = seq_df.columns
+
+if drop_empty == 'false':
+    print ("No occupancy check performed.")
+else:
+    for i in range (0,RESIDUES_COUNT):
+        if drop_empty == 'true' :
+            if OCCUPANCY[i]== 0:
+                only_gap.append(cols[i])
+        else:
+            if OCCUPANCY[i] < float(drop_empty):
+                only_gap.append(cols[i])
+    seq_df.drop(labels = only_gap, axis =1, inplace =True)
+    if only_gap!=[]:
+        print ("Low occupancy column in MSA! Columns %s were removed and new file was created." %(only_gap))
+        made_new_file =True 
+
+
+
+#remove stopcodons for BuildTreeEvol
+found_stopcodons=False
+if buildTreeEvo == 'true':
+    stopcodons = ['TAG','TAA','TGA','tag','taa','tga']
+    lastcodon_df = seq_df.iloc[:, -3:]
+
+    for row in range(0, SEQUENCES_COUNT):
+        if found_stopcodons == True:
+            break
+
+        codon= lastcodon_df.iloc[row].to_string(header=False, index=False).replace('\n','').replace(' ','')
+        if codon in stopcodons:
+            seq_df = seq_df.iloc[:, :-3]
+            print("Stop codons were found in alignment in sequence %s. This causes problems for iqtree and therefore the last 3 nucleotides have been removed."%(row))
+            made_new_file = True 
+            found_stopcodons = True
+
+
+#remove stars
+seq_df.iloc[:,-1] = seq_df.iloc[:,-1].str.replace('*', '')
+
+
+#add headers
+#seq_ids = pd.DataFrame(alignment_file.iloc[::2].values, columns=['index'] )
+
+msa_df=msa_df.reset_index()
+seqid_df = msa_df['index']
+seq_df = pd.concat([seqid_df,seq_df], axis=1)
+
+print (seq_df)
+
+#create_short_labels
+#seq_df['index'] = seq_df['index'].str.split('_CK', n=1, expand=True)[0]
+
+
+print (seq_df)
+#prep and write outfile
+seq_df['index']= ">" + seq_df['index'] + '\n'
+rows = seq_df.to_string(header=False,index=False,index_names=False).split('\n')
+
+file_extension = len(msa.split('.')[-1]) +1
+
+out_name = msa[: -file_extension]+'_checked.' + msa.split('.')[-1]
+
+with open (out_name, 'w') as m:
+    for row in rows:
+        row = row.replace('\\n','\n').replace(' ','')
+        m.write( row + '\n')
+        print (row)