-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathsplit_fastq_check_fastq_input_vs_split_output_singletons.sh
executable file
·76 lines (61 loc) · 2.68 KB
/
split_fastq_check_fastq_input_vs_split_output_singletons.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/bin/bash
##########################################################################
#
# Platform: NCI Gadi HPC
# Usage: this script is run by split_fastq_check_fastq_input_vs_split_output_run_parallel.pbs
# Version: 2.0
#
# For more details see: https://github.com/Sydney-Informatics-Hub/Fastq-to-BAM
#
# If you use this script towards a publication, please acknowledge the
# Sydney Informatics Hub (or co-authorship, where appropriate).
#
# Suggested acknowledgement:
# The authors acknowledge the support provided by the Sydney Informatics Hub,
# a Core Research Facility of the University of Sydney. This research/project
# was undertaken with the assistance of resources and services from the National
# Computational Infrastructure (NCI), which is supported by the Australian
# Government, and the Australian BioCommons which is enabled by NCRIS via
# Bioplatforms Australia funding.
#
##########################################################################
#1 - Get original number of reads from fastQC data.txt
#2 - Get number of reads processed from fastp logs
#3 - Get number of reads output from line-counting the split fastq
#4 - Write an error line to the .check file for each mismatch detected; write "<fastq> has passed all checks" if there are no errors
# Compare the read count of one (singleton) fastq across three sources and
# write the outcome to ./Check_fastq_split/<fastq>.check:
#   1. FastQC  - "Total Sequences" in ./FastQC/<fastq>*fastqc/fastqc_data.txt
#   2. fastp   - counts before/after filtering in ./Logs/Fastp/<fastq>.log
#   3. split   - (total lines / 4) over ./Fastq_split/*<fastq>*f*q.gz
#
# Arguments: $1 - fastq file name (a trailing ".fastq.gz" is stripped)
# Env:       NCPUS - number of parallel decompression jobs (default 1)
#
# A count that cannot be read is reported as an error. Empty values would
# otherwise compare as 0 == 0 inside [[ ... -ne ... ]] and a sample with
# missing inputs would be reported as having passed.

# Print a short usage line to stderr.
usage() {
  printf 'Usage: %s <fastq_file>\n' "${0##*/}" >&2
}

# Succeed only when $1 is a non-empty string of decimal digits.
is_count() {
  [[ $1 =~ ^[0-9]+$ ]]
}

# Append "<fastq> error: <message>" to the report and bump the error counter.
# Reads $fastq and $out and updates $err, all owned by main (bash dynamic scope).
# The sample name is passed as printf data, never as part of the format string.
report_error() {
  printf '%s error: %s\n' "$fastq" "$1" >> "$out"
  err=$((err + 1))
}

# Run all checks for the fastq named in $1.
# Returns 2 on a missing argument, 1 if the report directory cannot be
# created, and 0 once the report has been written (whatever its content).
main() {
  if [[ -z ${1:-} ]]; then
    usage
    return 2
  fi

  local fastq_name=$1
  local fastq out
  local err=0

  # Unmatched globs must expand to nothing so missing inputs are detectable.
  shopt -s nullglob

  fastq=$(basename -- "$fastq_name")
  fastq=${fastq%.fastq.gz}
  out=./Check_fastq_split/${fastq}.check
  mkdir -p -- "${out%/*}" || return 1
  rm -rf -- "$out"

  #1 - Get original number of reads from fastQC data.txt
  local fastqc_1=''
  local -a qc_files=(./FastQC/"${fastq}"*fastqc/fastqc_data.txt)
  # Exactly one match is required: several files would yield several counts.
  if (( ${#qc_files[@]} == 1 )); then
    fastqc_1=$(grep -m 1 "Total Sequences" "${qc_files[0]}" | awk '{print $3}')
  fi
  if ! is_count "$fastqc_1"; then
    report_error "could not read the FastQC read count (expected exactly one ./FastQC/${fastq}*fastqc/fastqc_data.txt, found ${#qc_files[@]})"
  fi

  #2 - Get number of reads processed from fastp logs
  local fastp_log=./Logs/Fastp/${fastq}.log
  local fastp_1='' fastp_tot=''
  if [[ -r $fastp_log ]]; then
    # The count sits on the line following the "Read1 before filtering" header.
    fastp_1=$(grep -A 1 "Read1 before filtering" "$fastp_log" | tail -1 | awk '{print $3}')
    fastp_tot=$(grep -m 1 "reads passed filter" "$fastp_log" | awk '{print $4}')
  fi
  if is_count "$fastp_1" && is_count "$fastp_tot"; then
    if [[ $fastp_1 -ne $fastp_tot ]]; then
      report_error "fastp reads before and after filtering (no filtering applied) do not match - $fastp_1 and $fastp_tot respectively"
    fi
  else
    report_error "could not read fastp read counts from $fastp_log"
  fi

  if is_count "$fastp_1" && is_count "$fastqc_1" && [[ $fastp_1 -ne $fastqc_1 ]]; then
    report_error "fastp and fastQC read counts do not match - $fastp_1 and $fastqc_1 respectively"
  fi

  #3 - Get number of reads output from line-counting the split fastq
  local splits=./Fastq_split
  local -a split_files=("$splits"/*"${fastq}"*f*q.gz)
  local split_1_lines='' split_pairs=''
  if (( ${#split_files[@]} > 0 )); then
    # NUL-delimited hand-off keeps unusual file names intact.
    split_1_lines=$(printf '%s\0' "${split_files[@]}" \
      | parallel -0 -j "${NCPUS:-1}" --will-cite 'zcat {} | wc -l' \
      | awk '{s+=$1} END {print s}')
  fi
  if is_count "$split_1_lines"; then
    # Four lines per fastq record.
    split_pairs=$((split_1_lines / 4))
  else
    report_error "could not count reads in split output ${splits}/*${fastq}*f*q.gz (${#split_files[@]} files found)"
  fi

  #Check 3 sources
  if is_count "$fastp_1" && is_count "$split_pairs" && [[ $fastp_1 -ne $split_pairs ]]; then
    report_error "fastp input and fastp split output read counts do not match - $fastp_1 and $split_pairs respectively"
  fi

  if (( err == 0 )); then
    printf '%s has passed all checks\n' "$fastq" >> "$out"
  fi
  return 0
}

main "$@"