-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathRunBlastN.sh
executable file
·104 lines (75 loc) · 3.26 KB
/
RunBlastN.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# Curated Hepatitis B Virus Alignments from GenBank Data
# Copyright (C) 2017 University of the Witwatersrand, Johannesburg, South Africa
# Author: Dr Trevor G. Bell, [email protected]
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
# This file is part of the pipeline described in the following publication:
# Bell, T. G., Yousif, M. and Kramvis, A. 2016
# Bioinformatic curation and alignment of genotyped hepatitis B virus (HBV)
# sequence data from the GenBank public database.
# SpringerPlus 5: 1896.
# https://springerplus.springeropen.com/articles/10.1186/s40064-016-3312-0
#!/bin/bash
GENO=$1
THRESHOLDSTART=$2
THRESHOLDEND=$3
LENGTH=$4
if [ -z "$GENO" ] || [ -z "$THRESHOLDSTART" ] || [ -z "$THRESHOLDEND" ] || [ -z "$LENGTH" ] ; then
echo "Specify genotype, cut-off start, cut-off end and length."
else
rm -v "$GENO"_log.txt
touch "$GENO"_log.txt
rm -v HBV_"$GENO"*
rm -v "$GENO"*clean*
rm -v "$GENO"*placed*
rm -v "$GENO"*merge*
rm -v "$GENO"*.csv
rm -v "$GENO".xml
echo "Extracting reference sequence..."
python getRef.py "$GENO"_full.fasta "$LENGTH" > HBV_"$GENO"
ret=$?
if [ $ret -ne 0 ] ; then
echo "Error with reference sequence length."
exit $?
fi
echo "Generating BLAST reference library..."
formatdb -i HBV_"$GENO" -p F -t HBV_"$GENO"
echo "BLASTing..."
blastn -out "$GENO".xml -outfmt 5 -query "$GENO".fasta -db HBV_"$GENO" -evalue 0.001 -max_target_seqs 5 -gapopen 5 -gapextend 5
echo "Parsing..."
python parseXML2b.py "$GENO".xml "$LENGTH" > "$GENO"placed.fasta
# Loop from here; replace the "sorted" file each time, as it is modified
for l in `seq $THRESHOLDSTART 1 $THRESHOLDEND` ;
do
TEMPFILE=$GENO$l
echo "Threshold $l..."
echo "Sorting..."
python FASTAsort.py "$GENO"placed.fasta > "$TEMPFILE"placedSorted.fasta
cp "$TEMPFILE"placedSorted.fasta "$TEMPFILE"placedSortedBackup.fasta
echo "Analyzing consensus..."
python splitCons.py "$TEMPFILE"placedSorted.fasta "$l" "$LENGTH" > "$TEMPFILE".csv
echo "Aligning..."
for i in _*.fasta ; do needle -asequence $i -bsequence "$GENO"_cons.fasta -aformat fasta -outfile n$i -gapopen 10 -gapextend 0.5 ; done
echo "Parsing consensus..."
python parseCons.py "$GENO" "$l" > "$TEMPFILE"mergeBack.fasta
echo "Building alignment..."
cat "$TEMPFILE"placedSorted.fasta "$TEMPFILE"mergeBack.fasta > "$TEMPFILE"clean.fasta
python FASTAsort.py "$TEMPFILE"clean.fasta > "$TEMPFILE"cleanSorted.fasta
echo "Checking alignment..."
python CheckFinalAlignment.py "$TEMPFILE"cleanSorted.fasta "$LENGTH" >> CheckFinalAlignment.txt
echo "Done."
rm -v _*.fasta
rm -v n_*.fasta
rm -v "$GENO"_cons.fasta
done
fi
# Ends