#### Script to analyse mtDNA NCR data from ancient samples obtained by amplicon-based NGS methods ####
## time bash NCR_ampliconbasedngs.sh 2>&1 | tee NCR_ampliconbasedngs.log
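## Expected in the working directory (inferred from the commands below): paired-end
## reads named *R1*.fastq.gz / *R2*.fastq.gz, the linear reference rCRS_NCR_lineal.fasta,
## the helper script range.py, and a conda environment "env_ncrngsamplicons" providing
## fastqc, fastp, bwa, samtools, picard, pmdtools, qualimap, freebayes,
## vcfallelicprimitives and haplogrep.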
## Choose the following values
# Minimum depth of coverage
depthcov=10
# Minimum allele frequency to consider a mixture
freq=0.3
# Minimum number of reads supporting the alternative allele
minaltcount=3
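# Note (derived from the maxdepthcov computation in the variant-calling step below):
# with freq=0.3 a position is reported as a mixture when the alternative allele is
# seen in roughly 30-70% of the reads; above that it is written as a plain substitution.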
if ls *.fastq.gz 1> /dev/null 2>&1; then
for fastq_file in *.fastq.gz; do
sample="${fastq_file//_R*/}"
if [[ -d ${sample} ]]; then
mv ${fastq_file} ${sample}
else
mkdir ${sample}
mv ${fastq_file} ${sample}
fi
done
fi
source activate env_ncrngsamplicons
main_dir=$(pwd)
#A directory to store the generated files is created
vis=visualization
if [[ -d ${vis} ]]; then
echo "Directory ${vis} already exists"
else
mkdir ${vis}
fi
#A directory to store the reference is created
ref=reference
seq_ref=rCRS_NCR_lineal.fasta
if [[ -d ${ref} ]]; then
echo "Directory ${ref} already exist, it is not going to be created again"
NCR=${main_dir}/${ref}/${seq_ref}
else
echo "A directory is created to keep the reference sequence and its indexes"
mkdir ${ref}
mv ${seq_ref} ${ref}
fi
cd ${ref}
#Indexes are generated only the first time the script runs.
if [[ -e ${seq_ref}.amb ]] && [[ -e ${seq_ref}.fai ]] ; then
echo "
Indexes were already created
"
else
echo "
Indexes are being created
"
time bwa index ${seq_ref}
time samtools faidx ${seq_ref}
picard CreateSequenceDictionary R=${seq_ref} O=rCRS_NCR_lineal.dict
fi
cd ${main_dir}
NCR=${main_dir}/${ref}/${seq_ref}
#A list of all directories containing samples is made
for samples in *; do if [[ -d ${samples} ]] && [[ ${samples} != ${vis} ]] && [[ ${samples} != ${ref} ]]; then echo ${samples}; fi done > samples.txt
echo -e "Sample\tPosition\tAlternative_base\tN_total\tN_alternative\tPercentage" > base_mix_depthcov-${depthcov}.txt
while read sample; do
echo "
Starting with sample ${sample}
"
cd "${sample}"
sample_dir=$(pwd)
#Quality is checked with FastQC
fastq_file=1_fastqc_file_quality_pre-trimming
if [[ -d ${fastq_file} ]]; then
echo "Directory ${fastq_file} already exists"
else
mkdir ${fastq_file}
fastqc -o ${fastq_file} -f fastq *fastq.gz
fi
#First fastp run: trims adapters and removes duplicates
fastp --dedup \
--detect_adapter_for_pe \
--trim_poly_x \
--length_required 30 \
--thread 8 \
--json ${sample}_temp.json \
--out1 ${sample}_fastp_filtered_R1_temp.fq.gz \
--out2 ${sample}_fastp_filtered_R2_temp.fq.gz \
--in1 *R1*.fastq.gz \
--in2 *R2*.fastq.gz
#Number of reads in the raw fastq files
num_1=$(grep before_filtering -m 1 -A1 ${sample}_temp.json | grep total | sed "s/.*://g" | sed "s/,//g")
echo "${sample} ${num_1}" >> ${main_dir}/tem_all.txt
#Percentage of duplication
dup_rate=$(echo "$(grep duplication -m 1 -A1 ${sample}_temp.json | grep rate | sed "s/.*://g" | sed "s/,//g")*100" | bc)
echo "${sample} ${dup_rate}" >> ${main_dir}/tem_dup_rate.txt
#Second fastp run: trims primers (25 bp from the 5' end) and 5 bp from the 3' end
fastp --trim_front1 25 --trim_tail1 5 \
--trim_front2 25 --trim_tail2 5 \
--out1 ${sample}_nonprimer_R1_temp.fq.gz \
--out2 ${sample}_nonprimer_R2_temp.fq.gz \
--in1 ${sample}_fastp_filtered_R1_temp.fq.gz \
--in2 ${sample}_fastp_filtered_R2_temp.fq.gz
#Third fastp run: trims low-quality bases and merges overlapping read pairs
fastp --cut_tail --cut_tail_mean_quality 20 \
--trim_poly_x \
--merge --overlap_len_require 10 \
--correction \
--html ${sample}_temp.html \
--json ${sample}_temp.json \
--merged_out ${sample}_M.fq.gz \
--out1 ${sample}_R1.fq.gz \
--out2 ${sample}_R2.fq.gz \
--in1 ${sample}_nonprimer_R1_temp.fq.gz \
--in2 ${sample}_nonprimer_R2_temp.fq.gz
rm *_temp*
#Quality is checked with FastQC
fastq_file=2_fastqc_file_quality_post-trimming
if [[ -d ${fastq_file} ]]; then
echo "Directory ${fastq_file} already exists"
else
mkdir ${fastq_file}
fastqc -o ${fastq_file} -f fastq *.fq.gz
fi
#Alignment is done with bwa aln and bwa samse/sampe
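# The bwa aln settings follow common ancient-DNA practice: -l 1024 effectively
# disables seeding (the seed is longer than any read), -n 0.01 tolerates more
# mismatches than the default, and -o 2 allows up to two gap openings.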
bwa aln -t 9 -l 1024 -n 0.01 -o 2 ${NCR} ${sample}_R1.fq.gz > tem_aln_R1.sai
bwa aln -t 9 -l 1024 -n 0.01 -o 2 ${NCR} ${sample}_R2.fq.gz > tem_aln_R2.sai
bwa aln -t 9 -l 1024 -n 0.01 -o 2 ${NCR} ${sample}_M.fq.gz > tem_aln_M.sai
bwa sampe ${NCR} tem_aln_R1.sai tem_aln_R2.sai ${sample}_R1.fq.gz ${sample}_R2.fq.gz | samtools view -Sbh -F4 -q30 | samtools sort -O bam -o tem_${sample}_NM_map.bam
bwa samse ${NCR} tem_aln_M.sai ${sample}_M.fq.gz | samtools view -Sbh -F4 -q30 | samtools sort -O bam -o tem_${sample}_M_map.bam
samtools merge -o tem_${sample}_map_unsort.bam tem_${sample}_NM_map.bam tem_${sample}_M_map.bam; samtools sort -O bam -o final_${sample}.bam tem_${sample}_map_unsort.bam
bam=final_${sample}.bam
samtools index ${bam}
#The number of reads with good quality is determined
num_2=$(samtools view -c ${bam})
echo "${sample} ${num_2}" >> ${main_dir}/tem_useful.txt
#Reads with post-mortem damage (PMD) are extracted
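# pmdtools scores each read by a log-likelihood ratio of carrying damage-typical
# C>T/G>A mismatches; --threshold 1 keeps only reads with a PMD score above 1.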
samtools view -h ${bam} | pmdtools --threshold 1 --header | samtools view -Sb - > pmd_final_${sample}.bam
pam=pmd_final_${sample}.bam
samtools index ${pam}
#The number of reads with PMD is determined
num_3=$(samtools view -c ${pam})
echo "${sample} ${num_3}" >> ${main_dir}/tem_useful_pmd.txt
#Mapping quality and mean depth of coverage are checked for the BAM with all reads
qualimap bamqc -bam ${bam} -c -gd hg19 -outdir 3_qualimap_bamqc
qual=$(grep "mean mapping quality" 3_qualimap_bamqc/genome_results.txt | sed 's/mean mapping quality = //')
echo "${sample} ${qual}" >> ${main_dir}/tem_qual.txt
cov=$(grep "mean coverageData" 3_qualimap_bamqc/genome_results.txt | sed 's/mean coverageData = //' | sed 's/X//' | sed 's/,//')
echo "${sample} ${cov}" >> ${main_dir}/tem_cov.txt
#Ranges of positions with at least the chosen depth of coverage are generated
echo "
Ranges of positions are being determined...
"
#Depth of coverage at or above the chosen value in the file with all reads
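# The awk chain below converts positions on the linear reference back to rCRS
# coordinates: the reference apparently concatenates rCRS 15901-16569 (positions
# 1-669) with rCRS 1-529 (positions 670-1198), hence the +15900/-669 shift.
# Positions 303-315 (presumably the HVS-II poly-C stretch) are excluded.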
samtools depth ${bam} | awk -v DEPTHCOV=${depthcov} '$3>=DEPTHCOV {print $2}' | awk '{if ($1<670) print $1+15900; else print $1-669}' | awk '$1 < 303 || $1 > 315 {print $1}' | sort -n > temp.txt
range=$(python ${main_dir}/range.py temp.txt | sed "s/[',\[\)\( ]//g" | sed "s/]//g" | sed "s/;$//")
echo "${sample}_${depthcov} ${range}" >> ${main_dir}/tem_range_${depthcov}.txt
#Depth of coverage at or above the chosen value in the file with damaged reads
samtools depth ${pam} | awk -F "\t" -v DEPTHCOV=${depthcov} '$3>=DEPTHCOV {print($2)}' | awk '{if ($1<670) print $1+15900; else print $1-669}' | awk '$1 < 303 || $1 > 315 {print $1}' | sort -n > temp_pmd.txt
range_pmd=$(python ${main_dir}/range.py temp_pmd.txt | sed "s/[',\[\)\( ]//g" | sed "s/]//g" | sed "s/;$//")
echo "${sample}_${depthcov}_pmd ${range_pmd}" >> ${main_dir}/tem_range_pmd_${depthcov}.txt
rm temp_pmd.txt
bases_with_good_cov=$(samtools depth ${bam} | awk -v DEPTHCOV=${depthcov} '$3>=DEPTHCOV' | wc -l)
echo -e "${sample}\t${bases_with_good_cov}" >> ${main_dir}/tem_goodcovinsamples_${depthcov}.txt
#Variant calling is done with freebayes and vcfallelicprimitives
echo "
Starting the variant call
"
#All reads, chosen depth coverage
maxdepthcov=$(echo "100-(${freq}*100)" | bc)
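# Despite its name, maxdepthcov is the upper bound (in %) on the alternative allele
# frequency for a call to be written as a mixture. The pipeline below keeps
# biallelic SNVs, encodes mixtures as IUPAC ambiguity codes (two bases -> M/K/W/S/Y/R,
# three -> V/H/D/B, four -> N) and remaps positions to rCRS coordinates as above.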
freebayes -f ${NCR} -i -X -F ${freq} -C ${minaltcount} --min-coverage ${depthcov} ${bam} | vcfallelicprimitives -kg > ${bam}_${depthcov}.vcf
awk '$3 == "." && length($5) == 1 && length($4) == 1 {print $2"\t"$4"\t"$5"\t"$10}' ${bam}_${depthcov}.vcf | sed 's/:/\t/g' | awk '{print $1"\t"$2"\t"$3"\t"$5"\t"$6"\t"100*$9/$5}' | awk -v MAXDEPTHCOV=${maxdepthcov} '{if ($6 < MAXDEPTHCOV) print $1"\t"$2$3; else print $1"\t"$3;}' | sed 's/,.*//' | sed 's/[AGTC]\{4\}/N/g' | sed 's/[ACG]\{3\}/V/g' |sed 's/[ATC]\{3\}/H/g' | sed 's/[ATG]\{3\}/D/g' | sed 's/[GTC]\{3\}/B/g' | sed 's/[AC]\{2\}/M/g' | sed 's/[TG]\{2\}/K/g' | sed 's/[AT]\{2\}/W/g' | sed 's/[GC]\{2\}/S/g' | sed 's/[TC]\{2\}/Y/g' | sed 's/[AG]\{2\}/R/g' | awk '{if ($1<670) print $1+15900$2; else print $1-669$2}' | sed -e 's/\t$//g' | awk -F"\t" '$1 < 303 || $1 > 315 {print $1$2}' > ${bam}_${depthcov}.txt
#Damaged reads, chosen depth coverage
freebayes -f ${NCR} -i -X -F ${freq} -C ${minaltcount} --min-coverage ${depthcov} ${pam} | vcfallelicprimitives -kg > ${pam}_${depthcov}.vcf
awk '$3 == "." && length($5) == 1 && length($4) == 1 {print $2"\t"$4"\t"$5"\t"$10}' ${pam}_${depthcov}.vcf | sed 's/:/\t/g' | awk '{print $1"\t"$2"\t"$3"\t"$5"\t"$6"\t"100*$9/$5}' | awk -v MAXDEPTHCOV=${maxdepthcov} '{if ($6 < MAXDEPTHCOV) print $1"\t"$2$3; else print $1"\t"$3;}' | sed 's/[AGTC]\{4\}/N/g' | sed 's/[ACG]\{3\}/V/g' |sed 's/[ATC]\{3\}/H/g' | sed 's/[ATG]\{3\}/D/g' | sed 's/[GTC]\{3\}/B/g' | sed 's/[AC]\{2\}/M/g' | sed 's/[TG]\{2\}/K/g' | sed 's/[AT]\{2\}/W/g' | sed 's/[GC]\{2\}/S/g' | sed 's/[TC]\{2\}/Y/g' | sed 's/[AG]\{2\}/R/g' | awk '{if ($1<670) print $1+15900$2; else print $1-669$2}' | sed -e 's/\t$//g' | awk -F"\t" '$1 < 303 || $1 > 315 {print $1$2}' > ${pam}_${depthcov}.txt
#Files listing mixed positions are generated for all reads
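# Output columns match the header written earlier: Sample, Position,
# Alternative_base, N_total, N_alternative, Percentage.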
awk '$3 == "." && length($4) == 1 && length($5) == 1 {if ($2<670) print $2+15900"\t"$5"\t"$10; else print $2-669"\t"$5"\t"$10}' ${bam}_${depthcov}.vcf | awk '$1 < 303 || $1 > 315 {print $0}' | sed 's/:/\t/g' | awk '{print $1"\t"$2"\t"$4"\t"$8"\t"100*$8/$4"\t"}' | awk -v vd=${sample} -v MAXDEPTHCOV=${maxdepthcov} '$5 < MAXDEPTHCOV {print vd"\t"$0}' >> ${main_dir}/base_mix_depthcov-${depthcov}.txt
cp ${bam}_${depthcov}.txt ${main_dir}/${vis}
cp ${pam}_${depthcov}.txt ${main_dir}/${vis}
rm *tem*
echo "
Changing sample...
"
cd ${main_dir}
done < samples.txt
#A file in hsd format (the one required by HaploGrep2) is generated in the visualization directory
cd ${main_dir}/${vis}
echo "
Creating the hsd file to use in HaploGrep2
"
echo "Sample Range Haplogroup Haplotype" > tem_haplogrep_format.txt
#A variable with the same value for all samples is set
haplogroup="?"
#The rest of the variables are generated line by line
for t in *final*bam_*.txt; do
#The sample name is extracted from the file name
sample="${t//*final_/}"
if [[ ${t} == pmd* ]]
then
sample="${sample//.bam_${depthcov}.*/_${depthcov}_pmd}"
ranges=$(awk -v SAMPLE=${sample} '$1==SAMPLE {print $2}' ${main_dir}/tem_range_pmd_${depthcov}.txt)
else
sample="${sample//.bam_${depthcov}.*/_${depthcov}}"
ranges=$(awk -v SAMPLE=${sample} '$1==SAMPLE {print $2}' ${main_dir}/tem_range_${depthcov}.txt)
fi
#Spaces standing in for tabs are converted into real tabs
sed -i 's/ /\t/g' ${t}
#Polymorphisms are printed to a temporary file (one per line)
awk -F"\t" '{print $1 $2}' ${t} > temp_file.txt
#The column is transformed into a row and saved in a variable
haplotype=$(for i in `< temp_file.txt`; do echo -en ${i}" "; done)
#The row is added in the column order expected by HaploGrep2 (with spaces instead of tabs)
echo "${sample} ${ranges} ${haplogroup} ${haplotype}" >> tem_haplogrep_format.txt
done
#Spaces are transformed into tabs
sed 's/ /\t/g' tem_haplogrep_format.txt | sed 's/\t$//g' | awk -F "\t" '{ if ($4 == "") print $0"\t16519T"; else print $0}' > haplogrep_format.txt
cp haplogrep_format.txt ${main_dir}
rm *tem*
cd ${main_dir}
echo "
Starting HaploGrep2...
"
grep -v pmd haplogrep_format.txt > tem_haplogrep_format.txt
bash haplogrep classify --in tem_haplogrep_format.txt --format hsd --out tem_haplos.txt
echo "
Generating final table...
"
echo -e 'ID\tTotal_reads\tDuplication_rate\tUseful_reads\tPercentage_of_useful_reads\tMean_depth_coverage\tMean_mapping_quality\tDamaged_useful_reads\tPercentage_of_Damaged_reads\tMixed_bases\tHaplogroup\tQuality\tPercentage_of_NCR_recovered\tRange\tHaplotype' > final_table.txt
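# 1198 below is the length (in bp) of the linear NCR reference, so 100*v9/1198 is
# the percentage of the NCR recovered at the chosen depth.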
while read sample; do
echo "${sample}" | awk -v v0=${sample} \
-v v1=$(awk -v v1_1=${sample} '$1==v1_1 {print $2}' tem_all.txt) \
-v v2=$(awk -v v2_1=${sample} '$1==v2_1 {print $2}' tem_dup_rate.txt) \
-v v3=$(awk -v v3_1=${sample} '$1==v3_1 {print $2}' tem_useful.txt) \
-v v4=$(awk -v v4_1=${sample} '$1==v4_1 {print $2}' tem_cov.txt) \
-v v5=$(awk -v v5_1=${sample} '$1==v5_1 {print $2}' tem_qual.txt) \
-v v6=$(awk -v v6_1=${sample} '$1==v6_1 {print $2}' tem_useful_pmd.txt) \
-v v7=$(awk -v v7_1=${sample} '$1==v7_1 {print $0}' base_mix_depthcov-${depthcov}.txt | wc -l) \
-v v8=$(awk -v v8_1=\"${sample}_${depthcov}\" '$1==v8_1 {print $2"\t"$4}' tem_haplos.txt | sed 's/"//g' | sed 's/\t/--/g') \
-v v9=$(awk -v v9_1=${sample} '$1==v9_1 {print $2}' tem_goodcovinsamples_${depthcov}.txt) \
-v v10=$(awk -v v10_1=${sample}_${depthcov} '$1==v10_1 {print $2}' tem_range_${depthcov}.txt) \
-v v11=$(awk -v v11_1=${sample}_${depthcov} '$1==v11_1 {print $0}' tem_haplogrep_format.txt | cut -f 4- | sed 's/\t/__/g') \
'$1==v0 {print v0"\t"v1"\t"v2"\t"v3"\t"v3/v1*100"\t"v4"\t"v5"\t"v6"\t"v6/v3*100"\t"v7"\t"v8"\t"100*v9/1198"\t"v10"\t"v11}' \
| sed 's/--/\t/g' | sed 's/__/ /g' | sed 's/16519T$//g' | sed 's/\t16519T\t/\t\t/g' |sed 's/,/\./g' | sort -h >> final_table.txt
done < samples.txt
rm *tem*.txt
rm -r ${vis}
echo "
THE END
"