-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbhootkal.sh
executable file
·124 lines (94 loc) · 5.46 KB
/
bhootkal.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/bin/bash
# Predict ancestral gene/protein # Author: Jitendra Narayan
# USAGE: ./bhootkal.sh
# Following pre-installed software and tools need to be in yout path
#-Python2 #Perl5
#-clustalo -for mutiple alingments # can be replaced with IQ-TREE http://www.iqtree.org/
#-phyml #Planning to replace it with MACSE https://bioweb.supagro.inra.fr/macse/index.php?menu=releases which is take care of frameshift
#-codeml #seems works well
#Perl and Python script location
#inFile should contain all fasta sequences
#scriptLoc=/home/jitendra/Desktop/testAncestralReco/scriptBase
#inputLoc=/home/jitendra/Desktop/testAncestralReco/inFile
# location to getopts.sh file
source scriptBase/getopt.sh
USAGE="-o OUT -d DIR -f FASTA -c CTRL -j JONES[ -a START_DATE_TIME ]"
parse_options "${USAGE}" ${@}
#Sequence alignment preparation.
#Make a multiple alignment, either with Mafft-L-INS-i or Clustal-Omega:
#There is possibilities to try using IQ-TREE.
#IQ-TREE: A fast and effective stochastic algorithm to infer phylogenetic trees by maximum likelihood. IQ-TREE compares favorably to RAxML and PhyML in terms of likelihoods with similar computing time
#You can opt any other you like, but keep an eye on formating
#mafft-linsi ${DIR}/${FASTA} > TargetSequences.out.fasta
#Remove the directory if it's present, otherwise do nothing.
rm -rf ${OUT} ?
#Remove special charater from file
#mostly needed when you have your own formated sequences
sed 's,|,_,g' -i ${DIR}/${FASTA} #fasta #header #sed
#Multiple alignment of all sequences
#Here I prefer clastal omega for alingments - because it is installed by default in my Os
# Will update it to use IQ-TREE in future
clustalo --in ${DIR}/${FASTA} --out TargetSequences.out.fasta
#It is always recommended to check by eye, To view the alignments in jalview
#jalview TargetSequences.out.fasta
#Formating is a mess, why not one format. We need to format the out file for next run.
#The format of the resulting alignment is FASTA. However, most phylogenetic softwares use PHYLIP format. So, you have to convert it into PHYLIP format.
#Script adapted from online source @ :) thanks for open source community
python scriptBase/convert_fasta2phylip.py TargetSequences.out.fasta TargetSequences.out.phy
#As I say, it is upto you to choose
#generate a tree, either with PhyML (one of the most accurate tool) or FastTree (very fast and pretty accurate):
# or MACSE (relatively new and accurate) https://bioweb.supagro.inra.fr/macse/index.php?menu=releases
#FastTree -nosupport TargetSequences.out.phy > TargetSequences.out.tree
#NOTE: -nosupport:(we don't want boostrap, as this will cause trouble for further analyses in CodeML).
#Try running phyml -h for all the options
phyml -i TargetSequences.out.phy -d aa -m JTT -c 4 -a e -b 0
#-i = input file
#-d aa: amino acid sequences
#-m JTT: (substitution matrix). JTT works fine for most proteins, but other matrices (WAG, LG) can do slightly better.
#-c 4: (numbers of categories for the gamma distribution)
#-a e: (estimate alpha parameter for the gamma distribution)
#-b 0: (we don't want boostrap, as this will cause trouble for further analyses in CodeML).
#DNA interleaved sequence file, default parameters : ./phyml -i seqs1
#AA interleaved sequence file, default parameters : ./phyml -i seqs2 -d aa
#AA sequential sequence file, with customization : ./phyml -i seqs3 -q -d aa -m JTT -c 4 -a e
#Move the tree in a file
cp TargetSequences.out.phy_phyml_tree TargetSequences.out.tree
#Visual confirm the tree
#View the tree in NJPlot or FigTree
#njplot
#Create a control file for ancestral reconstruction "${CTRL}" ### SEE ALL THE PARAMETERS THERE N CHANGE ACCORDINGLY ###
#Explanation of some parameters:
#runmode = 0 => We provide the tree.
#clock = 0 => We don't set a molecular clock. We assume that the genes are evolving at different rate.
#aaDist = 0 => We don't use the physicochemical properties of the amino acid.
#aaRatefile = ./jones.dat => We use the JTT matrix. Other matrix could be used (WAG, etc...)
#model = 2 => We use an empirical model (= substitutions matrix such as JTT).
#fix_alpha = 0 => We estimated the alpha parameter of the gamma distribution.
#alpha = 0.5 => We start the estimation from 0.5
#RateAncestor = 1 => Force the estimation of ancestral states.
#cleandata = 0 => Keep all ambigous data ("-", "X").
#You may have to copy the file "jones.dat" from the dat folder in the PAML package, or indicate its location.
#In BioLinux it is located at /usr/share/paml/dat/jones.dat
cp ${JONES} .
codeml ${CTRL}
#Read more at https://petrov.stanford.edu/software/src/paml3.15/doc/pamlDOC.pdf
#CodeML will also write into many files, but only two are of interest here:
#TargetSequences.out.mlc => Contains many information on evolutionary rates.
#rst => Contains ancestral states for sites and for nodes.
#Extract the sequence in fasta format
python scriptBase/parse_rst.py rst > ancestral_sequences.fasta
#Compute physico-chemical properties on ancestral sequences.
#Python scripts are adapted from open source community
#python scriptBase/compute_pI.py ${DIR}/${FASTA}
#python scriptBase/compute_pI.py ancestral_sequences.fasta
#Map properties on tree.
#python scriptBase/map_on_tree.py ancestral_sequences.fasta TargetSequences.out.tree > TargetSequences.out_annotated_pI.tree
mkdir ${OUT}
#find . -maxdepth 1 \( ! -type d \) -exec sh -c 'mv "$@"' OUT} _ {} \;
for file in *; do
if ! [ -d "$file" ]; then
mv -- "$file" "${OUT}"/
fi
done
cp ${OUT}/*.ctl ${OUT}/*.sh .
#You can estimating the stability effect of a mutation with FoldX