diff --git a/tests/make-test.py b/tests/make-test.py new file mode 100755 index 0000000..0c902fb --- /dev/null +++ b/tests/make-test.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 + +import os,sys,json,re + +def usage(): + print('Usage: make-test.py [OPTIONS] index.ipynb answers.ipynb') + print('Options:') + print(' -o, --output FILE Output file') + sys.exit(1) + +def parse_opts(): + opts = {} + opts['files'] = [] + args = sys.argv[1:] + while len(args): + if args[0]=='-o' or args[0]=='--output': + args = args[1:] + opts['outfile'] = args[0] + elif os.path.isfile(args[0]): + opts['files'].append(args[0]) + else: + usage() + args = args[1:] + if len(opts['files']) == 0: usage() + if 'outfile' in opts: opts['fh'] = open(opts['outfile'],"w") + else: opts['fh'] = sys.stdout + return opts + +def process_file(opts,file): + nb = json.load(open(file)) + opts['fh'].write("\n# "+file+"\n") + next_file = None + lines = [] + for cell in nb['cells']: + if cell['cell_type'] == 'markdown': + for line in cell['source']: + m = re.search(r'\(([^(]+\.ipynb)\)',line); + if m!=None: next_file = m.group(1) + m = re.search(r'^\s*`(.+)`\s*$',line) + if m!=None and m.group(1)!='`' : lines.append(m.group(1)) + if cell['cell_type'] == 'code': + for line in cell['source']: + line = line.rstrip('\n') + if len(lines)>0 and lines[-1][-1]=='\\': + lines[-1] = lines[-1].rstrip('\\') + lines[-1] += line + else: + lines.append(line) + for line in lines: + line = re.sub(r'\|\s*less -S\s*$', '| head',line) + line = re.sub(r'\|\s*less\s*$', '| head',line) + line = re.sub(r'less\s+-S\s+', 'tail ',line) + line = re.sub(r'less\s+', 'tail ',line) + opts['fh'].write(line+"\n") + opts['fh'].write("\n") + if next_file!=None: + dir = '.' + m = re.search(r'^(.*)/[^/]+$',file); + if m!=None and m.group(1)!='': dir = m.group(1) + opts['files'].append(dir+'/'+next_file) + +def main(opts): + opts['fh'].write("#!/bin/bash\n") + opts['fh'].write("set -e\n") + opts['fh'].write("set -x\n") + + for file in opts['files']: + process_file(opts,file) + + opts['fh'].close() + +if __name__ == "__main__": + opts = parse_opts() + main(opts) + diff --git a/tests/test_data_formats.log b/tests/test_data_formats.log new file mode 100644 index 0000000..936c7b7 --- /dev/null +++ b/tests/test_data_formats.log @@ -0,0 +1,558 @@ ++ cd /home/manager/course_data/data_formats/ ++ samtools --help + +Program: samtools (Tools for alignments in the SAM format) +Version: 1.10 (using htslib 1.10.2) + +Usage: samtools [options] + +Commands: + -- Indexing + dict create a sequence dictionary file + faidx index/extract FASTA + fqidx index/extract FASTQ + index index alignment + + -- Editing + calmd recalculate MD/NM tags and '=' bases + fixmate fix mate information + reheader replace BAM header + targetcut cut fosmid regions (for fosmid pool only) + addreplacerg adds or replaces RG tags + markdup mark duplicates + + -- File operations + collate shuffle and group alignments by name + cat concatenate BAMs + merge merge sorted alignments + mpileup multi-way pileup + sort sort alignment file + split splits a file by read group + quickcheck quickly check if SAM/BAM/CRAM file appears intact + fastq converts a BAM to a FASTQ + fasta converts a BAM to a FASTA + + -- Statistics + bedcov read depth per BED region + coverage alignment depth and percent coverage + depth compute the depth + flagstat simple stats + idxstats BAM index stats + phase phase heterozygotes + stats generate stats (former bamcheck) + + -- Viewing + flags explain BAM flags + tview text alignment viewer + view SAM<->BAM<->CRAM conversion + depad convert padded BAM to unpadded BAM + ++ bcftools --help + +Program: bcftools (Tools for variant calling and manipulating VCFs and BCFs) +License: GNU GPLv3+, due to use of the GNU Scientific Library +Version: 1.10.2 (using htslib 1.10.2) + +Usage: bcftools [--version|--version-only] [--help] + +Commands: + + -- Indexing + index index VCF/BCF files + + -- VCF/BCF manipulation + annotate annotate and edit VCF/BCF files + concat concatenate VCF/BCF files from the same set of samples + convert convert VCF/BCF files to different formats and back + isec intersections of VCF/BCF files + merge merge VCF/BCF files files from non-overlapping sample sets + norm left-align and normalize indels + plugin user-defined plugins + query transform VCF/BCF into user-defined formats + reheader modify VCF/BCF header, change sample names + sort sort VCF/BCF file + view VCF/BCF conversion, view, subset and filter VCF/BCF files + + -- VCF/BCF analysis + call SNP/indel calling + consensus create consensus sequence by applying VCF variants + cnv HMM CNV calling + csq call variation consequences + filter filter VCF/BCF files using fixed thresholds + gtcheck check sample concordance, detect sample swaps and contamination + mpileup multi-way pileup producing genotype likelihoods + polysomy detect number of chromosomal copies + roh identify runs of autozygosity (HMM) + stats produce VCF/BCF stats + + Most commands accept VCF, bgzipped VCF, and BCF with the file type detected + automatically even when streaming from a pipe. Indexed VCF and BCF will work + in all situations. Un-indexed VCF and BCF and streams will work in most but + not all situations. + ++ pwd +/home/manager/course_data/data_formats ++ perl -e 'printf "%d\n",ord("A")-33;' +32 ++ cat data/example.sam +ERR003814.1408899 163 1 19999970 23 37M = 20000147 213 GGTGGGTGGATCACCTGAGATCGGGAGTTTGAGACTA <=@A@??@=@A@A>@BAA@ABA:>@<>=BBB9@@2B3 X0:i:1 X1:i:20 MD:Z:37 RG:Z:ERR003814 AM:i:0 NM:i:0 SM:i:23 MQ:i:29 XT:A:U BQ:Z:@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +ERR003762.5016205 163 1 20000006 48 37M = 20000026 55 AACATGGAGAAACCCCATCTCTACTAAGGTACAAAAA @A><@A@?@BA@?B<@B>?@AAB??A5>B@?@AACAB?BC?BCBBBBBC@BCCBBBCBBBBBCBBBCBACABAC:AACCAB?A@6? X0:i:1 X1:i:0 MD:Z:108 RG:Z:ERR015472 AM:i:37 NM:i:0 SM:i:37 MQ:i:60 XT:A:U BQ:Z:@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@A +ERR015472.13730806 83 1 20000013 60 108M = 19999720 -400 AGAAACCCCATCTCTACTAAGGTACAAAATTAGATGGGCATGGTGGTGCACACCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAG 91);08;;?=@@@?67?==?-;?@;@@?A?@@9A=>@A>?AB?AB@@BBBCABC@BBBBBBCBBBBACBABCBBCBBAB:BBCCAB?BB:BA@A? X0:i:1 X1:i:0 MD:Z:108 RG:Z:ERR015472 AM:i:37 NM:i:0 SM:i:37 MQ:i:60 XT:A:U BQ:Z:@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@A +ERR003814.6979522 147 1 20000095 29 37M = 19999855 -276 GCAGGAGAATCGCTTGAACCCGGGAGGCAGAGGTTGT :@A3A8>< X0:i:279 MD:Z:37 RG:Z:ERR003814 AM:i:0 NM:i:0 SM:i:0 MQ:i:37 XT:A:R BQ:Z:@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + ++ samtools view -H data/NA20538.bam +@HD VN:1.0 GO:none SO:coordinate +@SQ SN:1 LN:249250621 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:1b22b98cdeb4a9304cb5d48026a85128 SP:Human +@SQ SN:2 LN:243199373 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:a0d9851da00400dec1098a9255ac712e SP:Human +@SQ SN:3 LN:198022430 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:fdfd811849cc2fadebc929bb925902e5 SP:Human +@SQ SN:4 LN:191154276 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:23dccd106897542ad87d2765d28a19a1 SP:Human +@SQ SN:5 LN:180915260 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:0740173db9ffd264d728f32784845cd7 SP:Human +@SQ SN:6 LN:171115067 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:1d3a93a248d92a729ee764823acbbc6b SP:Human +@SQ SN:7 LN:159138663 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:618366e953d6aaad97dbe4777c29375e SP:Human +@SQ SN:8 LN:146364022 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:96f514a9929e410c6651697bded59aec SP:Human +@SQ SN:9 LN:141213431 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:3e273117f15e0a400f01055d9f393768 SP:Human +@SQ SN:10 LN:135534747 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:988c28e000e84c26d552359af1ea2e1d SP:Human +@SQ SN:11 LN:135006516 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:98c59049a2df285c76ffb1c6db8f8b96 SP:Human +@SQ SN:12 LN:133851895 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:51851ac0e1a115847ad36449b0015864 SP:Human +@SQ SN:13 LN:115169878 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:283f8d7892baa81b510a015719ca7b0b SP:Human +@SQ SN:14 LN:107349540 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:98f3cae32b2a2e9524bc19813927542e SP:Human +@SQ SN:15 LN:102531392 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:e5645a794a8238215b2cd77acb95a078 SP:Human +@SQ SN:16 LN:90354753 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:fc9b1a7b42b97a864f56b348b06095e6 SP:Human +@SQ SN:17 LN:81195210 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:351f64d4f4f9ddd45b35336ad97aa6de SP:Human +@SQ SN:18 LN:78077248 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:b15d4b2d29dde9d3e4f93d1d0f2cbc9c SP:Human +@SQ SN:19 LN:59128983 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:1aacd71f30db8e561810913e0b72636d SP:Human +@SQ SN:20 LN:63025520 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:0dec9660ec1efaaf33281c0d5ea2560f SP:Human +@SQ SN:21 LN:48129895 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:2979a6085bfe28e3ad6f552f361ed74d SP:Human +@SQ SN:22 LN:51304566 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:a718acaa6135fdca8357d5bfe94211dd SP:Human +@SQ SN:X LN:155270560 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:7e0e2e580297b7764e31dbc80c2540dd SP:Human +@SQ SN:Y LN:59373566 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:1fa3474750af0948bdf97d5a0ee52e51 SP:Human +@SQ SN:MT LN:16569 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:c68f52674c9fb33aef52dcf399755519 SP:Human +@SQ SN:GL000207.1 LN:4262 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:f3814841f1939d3ca19072d9e89f3fd7 SP:Human +@SQ SN:GL000226.1 LN:15008 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:1c1b2cd1fccbc0a99b6a447fa24d1504 SP:Human +@SQ SN:GL000229.1 LN:19913 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:d0f40ec87de311d8e715b52e4c7062e1 SP:Human +@SQ SN:GL000231.1 LN:27386 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:ba8882ce3a1efa2080e5d29b956568a4 SP:Human +@SQ SN:GL000210.1 LN:27682 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:851106a74238044126131ce2a8e5847c SP:Human +@SQ SN:GL000239.1 LN:33824 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:99795f15702caec4fa1c4e15f8a29c07 SP:Human +@SQ SN:GL000235.1 LN:34474 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:118a25ca210cfbcdfb6c2ebb249f9680 SP:Human +@SQ SN:GL000201.1 LN:36148 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:dfb7e7ec60ffdcb85cb359ea28454ee9 SP:Human +@SQ SN:GL000247.1 LN:36422 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:7de00226bb7df1c57276ca6baabafd15 SP:Human +@SQ SN:GL000245.1 LN:36651 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:89bc61960f37d94abf0df2d481ada0ec SP:Human +@SQ SN:GL000197.1 LN:37175 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:6f5efdd36643a9b8c8ccad6f2f1edc7b SP:Human +@SQ SN:GL000203.1 LN:37498 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:96358c325fe0e70bee73436e8bb14dbd SP:Human +@SQ SN:GL000246.1 LN:38154 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:e4afcd31912af9d9c2546acf1cb23af2 SP:Human +@SQ SN:GL000249.1 LN:38502 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:1d78abec37c15fe29a275eb08d5af236 SP:Human +@SQ SN:GL000196.1 LN:38914 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:d92206d1bb4c3b4019c43c0875c06dc0 SP:Human +@SQ SN:GL000248.1 LN:39786 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:5a8e43bec9be36c7b49c84d585107776 SP:Human +@SQ SN:GL000244.1 LN:39929 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:0996b4475f353ca98bacb756ac479140 SP:Human +@SQ SN:GL000238.1 LN:39939 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:131b1efc3270cc838686b54e7c34b17b SP:Human +@SQ SN:GL000202.1 LN:40103 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:06cbf126247d89664a4faebad130fe9c SP:Human +@SQ SN:GL000234.1 LN:40531 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:93f998536b61a56fd0ff47322a911d4b SP:Human +@SQ SN:GL000232.1 LN:40652 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:3e06b6741061ad93a8587531307057d8 SP:Human +@SQ SN:GL000206.1 LN:41001 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:43f69e423533e948bfae5ce1d45bd3f1 SP:Human +@SQ SN:GL000240.1 LN:41933 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:445a86173da9f237d7bcf41c6cb8cc62 SP:Human +@SQ SN:GL000236.1 LN:41934 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:fdcd739913efa1fdc64b6c0cd7016779 SP:Human +@SQ SN:GL000241.1 LN:42152 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:ef4258cdc5a45c206cea8fc3e1d858cf SP:Human +@SQ SN:GL000243.1 LN:43341 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:cc34279a7e353136741c9fce79bc4396 SP:Human +@SQ SN:GL000242.1 LN:43523 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:2f8694fc47576bc81b5fe9e7de0ba49e SP:Human +@SQ SN:GL000230.1 LN:43691 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:b4eb71ee878d3706246b7c1dbef69299 SP:Human +@SQ SN:GL000237.1 LN:45867 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:e0c82e7751df73f4f6d0ed30cdc853c0 SP:Human +@SQ SN:GL000233.1 LN:45941 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:7fed60298a8d62ff808b74b6ce820001 SP:Human +@SQ SN:GL000204.1 LN:81310 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:efc49c871536fa8d79cb0a06fa739722 SP:Human +@SQ SN:GL000198.1 LN:90085 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:868e7784040da90d900d2d1b667a1383 SP:Human +@SQ SN:GL000208.1 LN:92689 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:aa81be49bf3fe63a79bdc6a6f279abf6 SP:Human +@SQ SN:GL000191.1 LN:106433 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:d75b436f50a8214ee9c2a51d30b2c2cc SP:Human +@SQ SN:GL000227.1 LN:128374 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:a4aead23f8053f2655e468bcc6ecdceb SP:Human +@SQ SN:GL000228.1 LN:129120 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:c5a17c97e2c1a0b6a9cc5a6b064b714f SP:Human +@SQ SN:GL000214.1 LN:137718 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:46c2032c37f2ed899eb41c0473319a69 SP:Human +@SQ SN:GL000221.1 LN:155397 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:3238fb74ea87ae857f9c7508d315babb SP:Human +@SQ SN:GL000209.1 LN:159169 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:f40598e2a5a6b26e84a3775e0d1e2c81 SP:Human +@SQ SN:GL000218.1 LN:161147 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:1d708b54644c26c7e01c2dad5426d38c SP:Human +@SQ SN:GL000220.1 LN:161802 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:fc35de963c57bf7648429e6454f1c9db SP:Human +@SQ SN:GL000213.1 LN:164239 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:9d424fdcc98866650b58f004080a992a SP:Human +@SQ SN:GL000211.1 LN:166566 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:7daaa45c66b288847b9b32b964e623d3 SP:Human +@SQ SN:GL000199.1 LN:169874 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:569af3b73522fab4b40995ae4944e78e SP:Human +@SQ SN:GL000217.1 LN:172149 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:6d243e18dea1945fb7f2517615b8f52e SP:Human +@SQ SN:GL000216.1 LN:172294 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:642a232d91c486ac339263820aef7fe0 SP:Human +@SQ SN:GL000215.1 LN:172545 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:5eb3b418480ae67a997957c909375a73 SP:Human +@SQ SN:GL000205.1 LN:174588 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:d22441398d99caf673e9afb9a1908ec5 SP:Human +@SQ SN:GL000219.1 LN:179198 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:f977edd13bac459cb2ed4a5457dba1b3 SP:Human +@SQ SN:GL000224.1 LN:179693 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:d5b2fc04f6b41b212a4198a07f450e20 SP:Human +@SQ SN:GL000223.1 LN:180455 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:399dfa03bf32022ab52a846f7ca35b30 SP:Human +@SQ SN:GL000195.1 LN:182896 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:5d9ec007868d517e73543b005ba48535 SP:Human +@SQ SN:GL000212.1 LN:186858 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:563531689f3dbd691331fd6c5730a88b SP:Human +@SQ SN:GL000222.1 LN:186861 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:6fe9abac455169f50470f5a6b01d0f59 SP:Human +@SQ SN:GL000200.1 LN:187035 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:75e4c8d17cd4addf3917d1703cacaf25 SP:Human +@SQ SN:GL000193.1 LN:189789 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:dbb6e8ece0b5de29da56601613007c2a SP:Human +@SQ SN:GL000194.1 LN:191469 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:6ac8f815bf8e845bb3031b73f812c012 SP:Human +@SQ SN:GL000225.1 LN:211173 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:63945c3e6962f28ffd469719a747e73c SP:Human +@SQ SN:GL000192.1 LN:547496 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:325ba9e808f669dfeee210fdd7b470ac SP:Human +@RG ID:ERR003612 PL:ILLUMINA LB:g1k-sc-NA20538-TOS-1 PI:2000 DS:SRP000540 SM:NA20538 CN:SC +@RG ID:ERR003761 PL:ILLUMINA LB:g1k-sc-NA20538-TOS-1 PI:2000 DS:SRP000540 SM:NA20538 CN:SC +@RG ID:ERR003762 PL:ILLUMINA LB:g1k-sc-NA20538-TOS-1 PI:2000 DS:SRP000540 SM:NA20538 CN:SC +@RG ID:ERR003763 PL:ILLUMINA LB:g1k-sc-NA20538-TOS-1 PI:2000 DS:SRP000540 SM:NA20538 CN:SC +@RG ID:ERR003764 PL:ILLUMINA LB:g1k-sc-NA20538-TOS-1 PI:2000 DS:SRP000540 SM:NA20538 CN:SC +@RG ID:ERR003765 PL:ILLUMINA LB:g1k-sc-NA20538-TOS-1 PI:2000 DS:SRP000540 SM:NA20538 CN:SC +@RG ID:ERR003766 PL:ILLUMINA LB:g1k-sc-NA20538-TOS-1 PI:2000 DS:SRP000540 SM:NA20538 CN:SC +@RG ID:ERR003767 PL:ILLUMINA LB:g1k-sc-NA20538-TOS-1 PI:2000 DS:SRP000540 SM:NA20538 CN:SC +@RG ID:ERR003811 PL:ILLUMINA LB:g1k-sc-NA20538-TOS-1 PI:2000 DS:SRP000540 SM:NA20538 CN:SC +@RG ID:ERR003812 PL:ILLUMINA LB:g1k-sc-NA20538-TOS-1 PI:2000 DS:SRP000540 SM:NA20538 CN:SC +@RG ID:ERR003813 PL:ILLUMINA LB:g1k-sc-NA20538-TOS-1 PI:2000 DS:SRP000540 SM:NA20538 CN:SC +@RG ID:ERR003814 PL:ILLUMINA LB:g1k-sc-NA20538-TOS-1 PI:2000 DS:SRP000540 SM:NA20538 CN:SC +@RG ID:ERR003815 PL:ILLUMINA LB:g1k-sc-NA20538-TOS-1 PI:2000 DS:SRP000540 SM:NA20538 CN:SC +@RG ID:ERR003816 PL:ILLUMINA LB:g1k-sc-NA20538-TOS-1 PI:2000 DS:SRP000540 SM:NA20538 CN:SC +@RG ID:ERR015472 PL:ILLUMINA LB:g1k-sc-NA20538-A PI:450 DS:SRP000540 SM:NA20538 CN:SC +@PG ID:GATK IndelRealigner VN:1.0.4487 CL:SNPsFileForDebugging=null targetIntervals=/lustre/scratch105/projects/g1k/ref/broad_recal_data/pilot_data/indel.dbsnp_129_b37-vs-pilot.intervals output=null useOnlyKnownIndels=true maxReadsForConsensuses=120 maxConsensuses=30 entropyThreshold=0.15 indelsFileForDebugging=null noOriginalAlignmentTags=false realignReadsWithBadMates=false maxReadsForRealignment=20000 noPGTag=false LODThresholdForCleaning=0.4 maxReadsInRam=500000 targetIntervalsAreNotSorted=false sortInCoordinateOrderEvenThoughItIsHighlyUnsafe=false statisticsFileForDebugging=null +@PG ID:GATK TableRecalibration VN:1.0.4487 CL:output_bam=null window_size_nqs=5 force_read_group=null smoothing=1 default_platform=ILLUMINA exception_if_no_tile=false homopolymer_nback=7 no_pg_tag=false skipUQUpdate=false default_read_group=RG max_quality_score=40 fail_with_no_eof_marker=true solid_recal_mode=SET_Q_ZERO solid_nocall_strategy=THROW_EXCEPTION force_platform=null preserve_qscores_less_than=5 Covariates=[ReadGroupCovariate, QualityScoreCovariate, CycleCovariate, DinucCovariate] pQ=5 maxQ=40 smoothing=1 +@PG ID:bwa VN:0.5.5 +@PG ID:samtools PN:samtools PP:GATK IndelRealigner VN:1.10 CL:samtools view -H data/NA20538.bam +@PG ID:samtools.1 PN:samtools PP:GATK TableRecalibration VN:1.10 CL:samtools view -H data/NA20538.bam +@PG ID:samtools.2 PN:samtools PP:bwa VN:1.10 CL:samtools view -H data/NA20538.bam ++ head -n 1 ++ samtools view data/NA20538.bam +ERR003814.1408899 163 1 19999970 23 37M = 20000147 213 GGTGGGTGGATCACCTGAGATCGGGAGTTTGAGACTA <=@A@??@=@A@A>@BAA@ABA:>@<>=BBB9@@2B3 X0:i:1 X1:i:20 MD:Z:37 RG:Z:ERR003814 AM:i:0 NM:i:0 SM:i:23 MQ:i:29 XT:A:U BQ:Z:@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ ++ samtools sort -o data/NA20538_sorted.bam data/NA20538.bam ++ samtools index data/NA20538_sorted.bam ++ bcftools view -h data/1kg.bcf +##fileformat=VCFv4.2 +##FILTER= +##samtoolsVersion=1.2-29-gca44680+htslib-1.2.1-50-g66e5ef4 +##samtoolsCommand=samtools mpileup -gu -t DP -I -f ref/human/GRCh37/hs37d5.fa -b bams.chr20.list +##reference=file:///ref/human/GRCh37/hs37d5.fa +##bcftools_callVersion=1.2-208-gcdcf7e1+htslib-1.2.1-158-g6876a07 +##bcftools_callCommand=call -mv -Oz -o chr20.dp.vcf.gz +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##bcftools_viewVersion=1.10.2+htslib-1.10.2 +##bcftools_viewCommand=view -h data/1kg.bcf; Date=Sun Oct 4 11:31:01 2020 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00096 HG00097 HG00099 HG00100 HG00101 HG00102 HG00103 HG00105 HG00106 HG00107 HG00108 HG00109 HG00110 HG00111 HG00112 HG00113 HG00114 HG00115 HG00116 HG00117 HG00118 HG00119 HG00120 HG00121 HG00122 HG00123 HG00124 HG00125 HG00126 HG00127 HG00128 HG00129 HG00130 HG00131 HG00132 HG00133 HG00136 HG00137 HG00138 HG00139 HG00140 HG00141 HG00142 HG00143 HG00145 HG00146 HG00148 HG00149 HG00150 HG00151 ++ bcftools index data/1kg.bcf ++ bcftools view -H -r 20:24042765-24043073 data/1kg.bcf +20 24042765 . A T 445 . AN=100;AC=5;AC_Het=5;AC_Hom=0;AC_Hemi=0 GT:PL:DP 0/1:42,3,0:1 0/0:0,18,166:6 0/0:0,27,255:9 0/1:158,0,93:10 0/1:121,0,87:7 0/0:0,15,166:5 0/0:0,6,100:2 0/0:0,39,255:13 0/0:0,15,155:5 0/0:0,21,215:7 0/0:0,12,138:4 0/0:0,24,212:8 0/0:0,12,127:4 0/0:0,24,209:8 0/0:0,15,185:5 0/0:0,27,255:9 0/0:0,9,110:3 0/0:0,9,95:3 0/0:0,18,165:6 0/0:0,24,216:8 0/0:0,27,247:9 0/0:0,3,30:1 0/0:0,9,77:3 0/0:0,12,119:4 0/0:0,21,201:7 0/0:0,12,127:4 0/0:0,45,255:15 0/0:0,3,32:1 0/0:0,30,255:10 0/0:0,21,216:7 0/0:0,15,162:5 0/1:53,0,130:6 0/0:0,12,142:4 0/0:0,12,114:4 0/0:0,33,255:11 0/0:0,39,255:13 0/0:0,15,174:5 0/0:0,12,120:4 0/0:0,24,228:8 0/0:0,30,255:10 0/1:166,0,42:8 0/0:0,27,248:9 0/0:0,6,78:2 0/0:0,45,255:15 0/0:0,9,99:3 0/0:0,15,143:5 0/0:0,33,255:11 0/0:0,15,158:5 0/0:0,9,111:3 0/0:0,12,130:4 +20 24042793 . G A 734 . AN=100;AC=7;AC_Het=7;AC_Hom=0;AC_Hemi=0 GT:PL:DP 0/1:35,3,0:1 0/0:0,15,151:5 0/0:0,36,255:12 0/1:176,0,138:12 0/1:120,0,87:7 0/0:0,9,106:3 0/0:0,12,166:4 0/0:0,30,255:10 0/0:0,18,195:6 0/0:0,27,228:9 0/0:0,9,108:3 0/0:0,24,223:8 0/0:0,21,197:7 0/0:0,18,190:6 0/0:0,18,191:6 0/0:0,42,255:14 0/0:0,21,184:7 0/0:0,15,153:5 0/0:0,12,106:4 0/0:0,18,183:6 0/0:0,21,203:7 0/0:0,6,66:2 0/0:0,12,122:4 0/0:0,12,112:4 0/0:0,21,212:7 0/0:0,21,196:7 0/0:0,45,255:15 0/1:40,3,0:1 0/0:0,36,255:12 0/0:0,12,158:4 0/0:0,12,117:4 0/1:72,0,63:5 0/0:0,12,138:4 0/0:0,12,130:4 0/0:0,21,208:7 0/0:0,33,246:11 0/0:0,9,107:3 0/0:0,15,151:5 0/0:0,39,255:13 0/0:0,42,255:14 0/1:185,0,169:14 0/0:0,21,223:7 0/0:0,3,38:1 0/1:223,0,106:13 0/0:0,15,168:5 0/0:0,9,94:3 0/0:0,39,255:13 0/0:0,18,189:6 0/0:0,6,82:2 0/0:0,12,136:4 +20 24042907 . G A 311 . AN=98;AC=5;AC_Het=5;AC_Hom=0;AC_Hemi=0 GT:PL:DP 0/0:0,15,157:5 0/0:0,24,232:8 0/0:0,21,207:7 0/1:174,0,234:18 0/1:61,0,115:7 0/0:0,21,192:7 0/0:0,15,215:5 0/0:0,30,255:10 0/0:0,6,52:2 0/0:0,24,230:8 0/0:0,3,32:1 0/0:0,33,255:11 0/0:0,21,206:7 0/0:0,9,102:3 0/0:0,15,170:5 0/0:0,27,240:9 0/0:0,3,24:1 0/0:0,9,111:3 0/0:0,9,99:3 0/0:0,33,255:11 0/0:0,24,248:8 0/0:0,15,156:5 0/0:0,6,57:2 0/0:0,36,255:12 0/0:0,24,227:8 0/0:0,24,180:8 0/0:0,30,239:10 0/1:52,0,40:4 0/0:0,24,235:8 0/0:0,21,211:7 0/0:0,9,114:3 0/1:50,0,71:4 0/0:0,9,100:3 0/0:0,24,205:8 0/0:0,33,255:11 0/0:0,18,168:6 0/0:0,3,36:1 ./.:0,0,0:0 0/0:0,24,204:8 0/0:0,27,237:9 0/1:66,0,115:8 0/0:0,12,111:4 0/0:0,6,62:2 0/0:0,24,217:8 0/0:0,18,197:6 0/0:0,15,140:5 0/0:0,33,255:11 0/0:0,18,176:6 0/0:0,3,36:1 0/0:0,6,56:2 +20 24043073 . G A 824 . AN=100;AC=10;AC_Het=8;AC_Hom=2;AC_Hemi=0 GT:PL:DP 0/0:0,18,207:6 0/0:0,42,255:14 0/0:0,30,255:10 0/0:0,48,255:16 0/0:0,15,165:5 0/0:0,12,124:4 0/0:0,21,233:7 0/0:0,18,171:6 0/0:0,9,90:3 0/0:0,27,248:9 0/0:0,9,83:3 0/1:95,0,78:6 0/0:0,24,203:8 0/0:0,15,173:5 0/0:0,9,99:3 0/1:95,0,97:6 0/0:0,6,52:2 0/1:104,0,106:8 0/1:102,0,44:5 0/0:0,12,129:4 0/0:0,33,255:11 0/1:32,3,0:1 0/0:0,12,118:4 1/1:193,24,0:8 0/0:0,18,173:6 0/0:0,6,65:2 0/0:0,30,207:10 0/0:0,9,89:3 0/0:0,12,135:4 0/0:0,6,71:2 0/0:0,27,255:9 0/0:0,9,102:3 0/1:118,0,72:7 0/0:0,6,63:2 0/0:0,24,201:8 0/0:0,21,169:7 0/0:0,9,82:3 0/0:0,3,23:1 0/0:0,9,92:3 0/1:67,0,25:3 0/0:0,21,186:7 0/0:0,15,153:5 0/0:0,6,63:2 0/0:0,27,251:9 0/0:0,9,109:3 0/0:0,30,240:10 0/1:156,0,144:13 0/0:0,18,196:6 0/0:0,6,67:2 0/0:0,3,39:1 ++ bcftools query -l data/1kg.bcf +HG00096 +HG00097 +HG00099 +HG00100 +HG00101 +HG00102 +HG00103 +HG00105 +HG00106 +HG00107 +HG00108 +HG00109 +HG00110 +HG00111 +HG00112 +HG00113 +HG00114 +HG00115 +HG00116 +HG00117 +HG00118 +HG00119 +HG00120 +HG00121 +HG00122 +HG00123 +HG00124 +HG00125 +HG00126 +HG00127 +HG00128 +HG00129 +HG00130 +HG00131 +HG00132 +HG00133 +HG00136 +HG00137 +HG00138 +HG00139 +HG00140 +HG00141 +HG00142 +HG00143 +HG00145 +HG00146 +HG00148 +HG00149 +HG00150 +HG00151 ++ head -n 50 ++ bcftools view -s HG00131 data/1kg.bcf +##fileformat=VCFv4.2 +##FILTER= +##samtoolsVersion=1.2-29-gca44680+htslib-1.2.1-50-g66e5ef4 +##samtoolsCommand=samtools mpileup -gu -t DP -I -f ref/human/GRCh37/hs37d5.fa -b bams.chr20.list +##reference=file:///ref/human/GRCh37/hs37d5.fa +##bcftools_callVersion=1.2-208-gcdcf7e1+htslib-1.2.1-158-g6876a07 +##bcftools_callCommand=call -mv -Oz -o chr20.dp.vcf.gz +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##bcftools_viewVersion=1.10.2+htslib-1.10.2 +##bcftools_viewCommand=view -s HG00131 data/1kg.bcf; Date=Sun Oct 4 11:31:01 2020 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00131 +20 23946362 . G A 84 . AN=2;AC=0;AC_Het=2;AC_Hom=0;AC_Hemi=0 GT:PL:DP 0/0:0,12,95:4 +20 23946710 . C T 999 . AN=2;AC=2;AC_Het=4;AC_Hom=90;AC_Hemi=0 GT:PL:DP 1/1:76,9,0:3 +20 23947133 . C T 118 . AN=2;AC=0;AC_Het=1;AC_Hom=0;AC_Hemi=0 GT:PL:DP 0/0:0,9,87:3 +20 23947416 . C T 537 . AN=2;AC=0;AC_Het=4;AC_Hom=0;AC_Hemi=0 GT:PL:DP 0/0:0,30,169:10 +20 23947451 . T G 32.1879 . AN=2;AC=0;AC_Het=3;AC_Hom=0;AC_Hemi=0 GT:PL:DP 0/0:0,30,225:10 +20 23947981 . T C 17.4338 . AN=2;AC=0;AC_Het=1;AC_Hom=0;AC_Hemi=0 GT:PL:DP 0/0:0,9,81:3 +20 23948049 . A G 999 . AN=2;AC=2;AC_Het=1;AC_Hom=96;AC_Hemi=0 GT:PL:DP 1/1:187,30,0:10 ++ head ++ bcftools query '-f%POS\t%REF\t%ALT\n' -s HG00131 data/1kg.bcf +23946362 G A +23946710 C T +23947133 C T +23947416 C T +23947451 T G +23947981 T C +23948049 A G +23948150 G A +23948593 G A +23948726 C T ++ head ++ bcftools query '-f%CHROM\t%POS\n' -i 'AC[0]>2' data/1kg.bcf +20 23946710 +20 23947416 +20 23947451 +20 23948049 +20 23948593 +20 23948726 +20 23949020 +20 23949210 +20 23949344 +20 23949583 ++ samtools view -h data/NA20538.bam ++ head data/NA20538.sam +@HD VN:1.0 GO:none SO:coordinate +@SQ SN:1 LN:249250621 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:1b22b98cdeb4a9304cb5d48026a85128 SP:Human +@SQ SN:2 LN:243199373 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:a0d9851da00400dec1098a9255ac712e SP:Human +@SQ SN:3 LN:198022430 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:fdfd811849cc2fadebc929bb925902e5 SP:Human +@SQ SN:4 LN:191154276 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:23dccd106897542ad87d2765d28a19a1 SP:Human +@SQ SN:5 LN:180915260 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:0740173db9ffd264d728f32784845cd7 SP:Human +@SQ SN:6 LN:171115067 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:1d3a93a248d92a729ee764823acbbc6b SP:Human +@SQ SN:7 LN:159138663 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:618366e953d6aaad97dbe4777c29375e SP:Human +@SQ SN:8 LN:146364022 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:96f514a9929e410c6651697bded59aec SP:Human +@SQ SN:9 LN:141213431 AS:NCBI37 UR:ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz M5:3e273117f15e0a400f01055d9f393768 SP:Human ++ samtools view -b data/NA20538.sam ++ samtools view -C -T data/Saccharomyces_cerevisiae.EF4.68.dna.toplevel.fa -o data/yeast.cram data/yeast.bam +[samfaipath] build FASTA index... ++ ls -l data +total 464120 +-rwxr-xr-x 1 manager manager 94866319 Oct 4 10:56 13681_1#18_1.fastq.gz +-rwxr-xr-x 1 manager manager 97223945 Oct 4 10:56 13681_1#18_2.fastq.gz +-rw-r--r-- 1 manager manager 1935292 Sep 25 21:02 1kg.bcf +-rw-r--r-- 1 manager manager 1647 Oct 4 11:31 1kg.bcf.csi +drwxr-xr-x 3 manager manager 4096 Sep 25 21:02 60A_Sc_DBVPG6044 +-rwxr-xr-x 1 manager manager 600 Sep 25 21:02 align.sh +-rw-r--r-- 1 manager manager 1413 Oct 4 10:56 example.fasta +-rw-r--r-- 1 manager manager 1197 Oct 4 10:56 example.fastq +-rw-r--r-- 1 manager manager 1673 Oct 4 10:56 example.sam +drwxr-xr-x 2 manager manager 4096 Oct 4 10:56 lane1 +-rw-r--r-- 1 manager manager 36178402 Oct 4 10:56 lane1.sorted.bam +-rw-r--r-- 1 manager manager 11184 Oct 4 10:56 lane1.sorted.bam.bai +-rw-r--r-- 1 manager manager 26273935 Oct 4 11:31 NA20538_2.bam +-rw-r--r-- 1 manager manager 26146846 Sep 25 21:02 NA20538.bam +-rw-r--r-- 1 manager manager 99669349 Oct 4 11:31 NA20538.sam +-rw-r--r-- 1 manager manager 26273907 Oct 4 11:31 NA20538_sorted.bam +-rw-r--r-- 1 manager manager 24304 Oct 4 11:31 NA20538_sorted.bam.bai +-rw-r--r-- 1 manager manager 12360636 Sep 25 21:02 Saccharomyces_cerevisiae.EF4.68.dna.toplevel.fa +-rw-r--r-- 1 manager manager 415 Oct 4 11:31 Saccharomyces_cerevisiae.EF4.68.dna.toplevel.fa.fai +-rw-r--r-- 1 manager manager 35619221 Sep 25 21:02 yeast.bam +-rw-r--r-- 1 manager manager 18618953 Oct 4 11:31 yeast.cram ++ picard FastqToSam F1=data/13681_1#18_1.fastq.gz F2=data/13681_1#18_2.fastq.gz O=data/13681_1#18.sam SM=13681_1#18 +INFO 2020-10-04 11:31:05 FastqToSam + +********** NOTE: Picard's command line syntax is changing. +********** +********** For more information, please see: +********** https://github.com/broadinstitute/picard/wiki/Command-Line-Syntax-Transition-For-Users-(Pre-Transition) +********** +********** The command line looks like this in the new syntax: +********** +********** FastqToSam -F1 data/13681_1#18_1.fastq.gz -F2 data/13681_1#18_2.fastq.gz -O data/13681_1#18.sam -SM 13681_1#18 +********** + + +11:31:06.066 INFO NativeLibraryLoader - Loading libgkl_compression.so from jar:file:/home/manager/miniconda/share/picard-2.23.4-0/picard.jar!/com/intel/gkl/native/libgkl_compression.so +[Sun Oct 04 11:31:06 BST 2020] FastqToSam FASTQ=data/13681_1#18_1.fastq.gz FASTQ2=data/13681_1#18_2.fastq.gz OUTPUT=data/13681_1#18.sam SAMPLE_NAME=13681_1#18 USE_SEQUENTIAL_FASTQS=false READ_GROUP_NAME=A SORT_ORDER=queryname MIN_Q=0 MAX_Q=93 STRIP_UNPAIRED_MATE_NUMBER=false ALLOW_AND_IGNORE_EMPTY_LINES=false VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false +[Sun Oct 04 11:31:06 BST 2020] Executing as manager@NGSBio2020 on Linux 5.4.0-48-generic amd64; OpenJDK 64-Bit Server VM 11.0.8-internal+0-adhoc..src; Deflater: Intel; Inflater: Intel; Provider GCS is not available; Picard version: 2.23.4 +INFO 2020-10-04 11:31:06 FastqToSam Auto-detected quality format as: Standard. +INFO 2020-10-04 11:31:10 FastqToSam Processed 1,000,000 records. Elapsed time: 00:00:04s. Time for last 1,000,000: 4s. Last read position: */* +INFO 2020-10-04 11:31:16 FastqToSam Processed 2,000,000 records. Elapsed time: 00:00:09s. Time for last 1,000,000: 5s. Last read position: */* +INFO 2020-10-04 11:31:20 FastqToSam Processed 1313527 fastq reads +[Sun Oct 04 11:31:26 BST 2020] picard.sam.FastqToSam done. Elapsed time: 0.35 minutes. +Runtime.totalMemory()=519110656 ++ samtools collate data/yeast.cram data/yeast.collated +[W::find_file_url] Failed to read reference "https://www.ebi.ac.uk/ena/cram/md5/6681ac2f62509cfc220d78751b8dc524": Input/output error ++ samtools fastq -1 data/yeast.collated_1.fastq -2 data/yeast.collated_2.fastq data/yeast.collated.bam +[M::bam2fq_mainloop] discarded 0 singletons +[M::bam2fq_mainloop] processed 397506 reads ++ bcftools view -O z -o data/1kg.vcf.gz data/1kg.bcf ++ ls -lrt data +total 1271476 +drwxr-xr-x 3 manager manager 4096 Sep 25 21:02 60A_Sc_DBVPG6044 +-rw-r--r-- 1 manager manager 1935292 Sep 25 21:02 1kg.bcf +-rw-r--r-- 1 manager manager 26146846 Sep 25 21:02 NA20538.bam +-rw-r--r-- 1 manager manager 12360636 Sep 25 21:02 Saccharomyces_cerevisiae.EF4.68.dna.toplevel.fa +-rwxr-xr-x 1 manager manager 600 Sep 25 21:02 align.sh +-rw-r--r-- 1 manager manager 35619221 Sep 25 21:02 yeast.bam +-rwxr-xr-x 1 manager manager 94866319 Oct 4 10:56 13681_1#18_1.fastq.gz +-rw-r--r-- 1 manager manager 1673 Oct 4 10:56 example.sam +-rw-r--r-- 1 manager manager 1197 Oct 4 10:56 example.fastq +-rw-r--r-- 1 manager manager 1413 Oct 4 10:56 example.fasta +-rwxr-xr-x 1 manager manager 97223945 Oct 4 10:56 13681_1#18_2.fastq.gz +-rw-r--r-- 1 manager manager 11184 Oct 4 10:56 lane1.sorted.bam.bai +-rw-r--r-- 1 manager manager 36178402 Oct 4 10:56 lane1.sorted.bam +drwxr-xr-x 2 manager manager 4096 Oct 4 10:56 lane1 +-rw-r--r-- 1 manager manager 26273907 Oct 4 11:31 NA20538_sorted.bam +-rw-r--r-- 1 manager manager 24304 Oct 4 11:31 NA20538_sorted.bam.bai +-rw-r--r-- 1 manager manager 1647 Oct 4 11:31 1kg.bcf.csi +-rw-r--r-- 1 manager manager 99669349 Oct 4 11:31 NA20538.sam +-rw-r--r-- 1 manager manager 26273935 Oct 4 11:31 NA20538_2.bam +-rw-r--r-- 1 manager manager 415 Oct 4 11:31 Saccharomyces_cerevisiae.EF4.68.dna.toplevel.fa.fai +-rw-r--r-- 1 manager manager 18618953 Oct 4 11:31 yeast.cram +-rw-r--r-- 1 manager manager 680410400 Oct 4 11:31 13681_1#18.sam +-rw-r--r-- 1 manager manager 45374218 Oct 4 11:31 yeast.collated.bam +-rw-r--r-- 1 manager manager 49524074 Oct 4 11:31 yeast.collated_1.fastq +-rw-r--r-- 1 manager manager 49524074 Oct 4 11:31 yeast.collated_2.fastq +-rw-r--r-- 1 manager manager 1889651 Oct 4 11:31 1kg.vcf.gz ++ bcftools view -O b -o data/1kg_2.bcf data/1kg.vcf.gz ++ samtools stats -F SECONDARY data/lane1.sorted.bam ++ head -n 47 data/lane1.sorted.bam.bchk +# This file was produced by samtools stats (1.10+htslib-1.10.2) and can be plotted using plot-bamstats +# This file contains statistics for all reads. +# The command line was: stats -F SECONDARY data/lane1.sorted.bam +# CHK, Checksum [2]Read Names [3]Sequences [4]Qualities +# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow) +CHK 77e527fe 79a53d84 2332f404 +# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part. +SN raw total sequences: 400252 +SN filtered sequences: 2746 +SN sequences: 397506 +SN is sorted: 1 +SN 1st fragments: 198753 +SN last fragments: 198753 +SN reads mapped: 303036 +SN reads mapped and paired: 291938 # paired-end technology bit set + both mates mapped +SN reads unmapped: 94470 +SN reads properly paired: 282478 # proper-pair bit set +SN reads paired: 397506 # paired-end technology bit set +SN reads duplicated: 0 # PCR or optical duplicate bit set +SN reads MQ0: 23803 # mapped and MQ=0 +SN reads QC failed: 0 +SN non-primary alignments: 0 +SN total length: 42930648 # ignores clipping +SN total first fragment length: 21465324 # ignores clipping +SN total last fragment length: 21465324 # ignores clipping +SN bases mapped: 32727888 # ignores clipping +SN bases mapped (cigar): 31886194 # more accurate +SN bases trimmed: 0 +SN bases duplicated: 0 +SN mismatches: 371298 # from NM fields +SN error rate: 1.164448e-02 # mismatches / bases mapped (cigar) +SN average length: 108 +SN average first fragment length: 108 +SN average last fragment length: 108 +SN maximum length: 108 +SN maximum first fragment length: 108 +SN maximum last fragment length: 108 +SN average quality: 29.4 +SN insert size average: 275.9 +SN insert size standard deviation: 47.7 +SN inward oriented pairs: 142274 +SN outward oriented pairs: 586 +SN pairs with other orientation: 874 +SN pairs on different chromosomes: 2235 +SN percentage of properly paired reads (%): 71.1 +# First Fragment Qualities. Use `grep ^FFQ | cut -f 2-` to extract this part. +# Columns correspond to qualities and rows to cycles. First column is the cycle number. ++ grep Use ++ grep '^#' data/lane1.sorted.bam.bchk +# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part. +# First Fragment Qualities. Use `grep ^FFQ | cut -f 2-` to extract this part. +# Last Fragment Qualities. Use `grep ^LFQ | cut -f 2-` to extract this part. +# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part. +# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part. +# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%]; and N and O counts as a percentage of all A/C/G/T bases [%] +# ACGT content per cycle for first fragments. Use `grep ^FBC | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%]; and N and O counts as a percentage of all A/C/G/T bases [%] +# ACGT raw counters for first fragments. Use `grep ^FTC | cut -f 2-` to extract this part. The columns are: A,C,G,T,N base counters +# ACGT content per cycle for last fragments. Use `grep ^LBC | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%]; and N and O counts as a percentage of all A/C/G/T bases [%] +# ACGT raw counters for last fragments. Use `grep ^LTC | cut -f 2-` to extract this part. The columns are: A,C,G,T,N base counters +# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: insert size, pairs total, inward oriented pairs, outward oriented pairs, other pairs +# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count +# Read lengths - first fragments. Use `grep ^FRL | cut -f 2-` to extract this part. The columns are: read length, count +# Read lengths - last fragments. Use `grep ^LRL | cut -f 2-` to extract this part. The columns are: read length, count +# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions +# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev) +# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part. +# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile ++ plot-bamstats -p data/lane1-plots/ data/lane1.sorted.bam.bchk diff --git a/tests/test_data_formats.sh b/tests/test_data_formats.sh index 642a155..582a454 100644 --- a/tests/test_data_formats.sh +++ b/tests/test_data_formats.sh @@ -1,33 +1,50 @@ -cd /home/manager/course_data/data_formats/data - -#samtools view -H NA20538.bam | less -S -#samtools view -H NA20538.bam -samtools view -H NA20538.bam | grep -c ^@RG -samtools view -H NA20538.bam | grep ^@PG | less -S -samtools view -H NA20538.bam | cut -f1,4 - -samtools view NA20538.bam | head -1 | cut -f1 -samtools view NA20538.bam | head -1 | cut -f3,4 -samtools view NA20538.bam | head -1 | cut -f5 -samtools view -C -T Saccaromyces_cerevisiae.EF4.68.dna.toplevel.fa -o yeast.cram yeast.bam -ls -lh yeast.bam yeast.cram -bcftools -bcftools view -bcftools view -h 1kg.bcf | less -bcftools view -O z -o 1kg.vcf.gz 1kg.bcf -bcftools index 1kg.bcf -bcftools view -H -r 20:24042765-24043073 1kg.bcf | less -S -bcftools query -l 1kg.bcf | wc -l -bcftools query -r 20:24019472 -s HG00107,HG00108 -f '%POS [ %GT]\n' 1kg.bcf -bcftools query -i 'AC>10' -f '%POS\n' 1kg | wc -l -bcftools query -s HG00107 -i 'FORMAT/DP>10 & FORMAT/GT="alt"' -f '%POS [%GT %DP]\n' 1kg.bcf | head -head 60A_Sc_DBVPG6044/lane1/s_7_1.fastq | grep ^@ -head 60A_Sc_DBVPG6044/lane1/s_7_2.fastq | grep ^@ -wc -l 60A_Sc_DBVPG6044/lane1/*.fastq -./align.sh -grep ^SN lane*.sorted.bam.bchk | awk -F'\t' '$2=="raw rotal sequences:"' -grep ^SN lane*.sorted.bam.bchk | awk -F'\t' '$2=="reads mapped:"' -grep ^SN lane*.sorted.bam.bchk | awk -F'\t' '$2=="pairs on different cromosomes:"' -samtools stats -F SECONDARY lane1.sorted.bam > lane1.sorted.bam.bchk -plot-bamstats -p lane1-plots/ lane1.sorted.bam.bchk -firefox lane1-plots/*.html & +#!/bin/bash +set -e +set -x + +# /home/manager/course_data/data_formats/practical/Notebooks/index.ipynb +cd ~/course_data/data_formats/ +samtools --help +bcftools --help +#picard -h # returns non-zero status, commenting out + + +# /home/manager/course_data/data_formats/practical/Notebooks/formats.ipynb +pwd +perl -e 'printf "%d\n",ord("A")-33;' +cat data/example.sam +samtools view -H data/NA20538.bam +samtools view data/NA20538.bam | head -n 1 +samtools sort -o data/NA20538_sorted.bam data/NA20538.bam +samtools index data/NA20538_sorted.bam +bcftools view -h data/1kg.bcf +bcftools index data/1kg.bcf +bcftools view -H -r 20:24042765-24043073 data/1kg.bcf +#bcftools query -h # non-zero status +bcftools query -l data/1kg.bcf +bcftools view -s HG00131 data/1kg.bcf | head -n 50 +bcftools query -f'%POS\t%REF\t%ALT\n' -s HG00131 data/1kg.bcf | head +bcftools query -f'%CHROM\t%POS\n' -i 'AC[0]>2' data/1kg.bcf | head + + +# /home/manager/course_data/data_formats/practical/Notebooks/conversion.ipynb +samtools view -h data/NA20538.bam > data/NA20538.sam +head data/NA20538.sam +samtools view -b data/NA20538.sam > data/NA20538_2.bam +samtools view -C -T data/Saccharomyces_cerevisiae.EF4.68.dna.toplevel.fa -o data/yeast.cram data/yeast.bam +ls -l data +picard FastqToSam F1=data/13681_1#18_1.fastq.gz F2=data/13681_1#18_2.fastq.gz O=data/13681_1#18.sam SM=13681_1#18 +#picard FastqToSam -h # non-zero status +samtools collate data/yeast.cram data/yeast.collated +samtools fastq -1 data/yeast.collated_1.fastq -2 data/yeast.collated_2.fastq data/yeast.collated.bam +bcftools view -O z -o data/1kg.vcf.gz data/1kg.bcf +ls -lrt data +bcftools view -O b -o data/1kg_2.bcf data/1kg.vcf.gz + + +# /home/manager/course_data/data_formats/practical/Notebooks/assessment.ipynb +samtools stats -F SECONDARY data/lane1.sorted.bam > data/lane1.sorted.bam.bchk +head -n 47 data/lane1.sorted.bam.bchk +grep ^'#' data/lane1.sorted.bam.bchk | grep 'Use' +plot-bamstats -p data/lane1-plots/ data/lane1.sorted.bam.bchk + diff --git a/tests/test_variant_calling.log b/tests/test_variant_calling.log new file mode 100644 index 0000000..28698bc --- /dev/null +++ b/tests/test_variant_calling.log @@ -0,0 +1,332 @@ ++ cd /home/manager/course_data/variant_calling/data ++ samtools --help + +Program: samtools (Tools for alignments in the SAM format) +Version: 1.10 (using htslib 1.10.2) + +Usage: samtools [options] + +Commands: + -- Indexing + dict create a sequence dictionary file + faidx index/extract FASTA + fqidx index/extract FASTQ + index index alignment + + -- Editing + calmd recalculate MD/NM tags and '=' bases + fixmate fix mate information + reheader replace BAM header + targetcut cut fosmid regions (for fosmid pool only) + addreplacerg adds or replaces RG tags + markdup mark duplicates + + -- File operations + collate shuffle and group alignments by name + cat concatenate BAMs + merge merge sorted alignments + mpileup multi-way pileup + sort sort alignment file + split splits a file by read group + quickcheck quickly check if SAM/BAM/CRAM file appears intact + fastq converts a BAM to a FASTQ + fasta converts a BAM to a FASTA + + -- Statistics + bedcov read depth per BED region + coverage alignment depth and percent coverage + depth compute the depth + flagstat simple stats + idxstats BAM index stats + phase phase heterozygotes + stats generate stats (former bamcheck) + + -- Viewing + flags explain BAM flags + tview text alignment viewer + view SAM<->BAM<->CRAM conversion + depad convert padded BAM to unpadded BAM + ++ bcftools --help + +Program: bcftools (Tools for variant calling and manipulating VCFs and BCFs) +License: GNU GPLv3+, due to use of the GNU Scientific Library +Version: 1.10.2 (using htslib 1.10.2) + +Usage: bcftools [--version|--version-only] [--help] + +Commands: + + -- Indexing + index index VCF/BCF files + + -- VCF/BCF manipulation + annotate annotate and edit VCF/BCF files + concat concatenate VCF/BCF files from the same set of samples + convert convert VCF/BCF files to different formats and back + isec intersections of VCF/BCF files + merge merge VCF/BCF files files from non-overlapping sample sets + norm left-align and normalize indels + plugin user-defined plugins + query transform VCF/BCF into user-defined formats + reheader modify VCF/BCF header, change sample names + sort sort VCF/BCF file + view VCF/BCF conversion, view, subset and filter VCF/BCF files + + -- VCF/BCF analysis + call SNP/indel calling + consensus create consensus sequence by applying VCF variants + cnv HMM CNV calling + csq call variation consequences + filter filter VCF/BCF files using fixed thresholds + gtcheck check sample concordance, detect sample swaps and contamination + mpileup multi-way pileup producing genotype likelihoods + polysomy detect number of chromosomal copies + roh identify runs of autozygosity (HMM) + stats produce VCF/BCF stats + + Most commands accept VCF, bgzipped VCF, and BCF with the file type detected + automatically even when streaming from a pipe. Indexed VCF and BCF will work + in all situations. Un-indexed VCF and BCF and streams will work in most but + not all situations. + ++ samtools stats -r GRCm38_68.19.fa A_J.bam ++ samtools stats -r GRCm38_68.19.fa NZO.bam ++ plot-bamstats -s GRCm38_68.19.fa ++ plot-bamstats -r GRCm38_68.19.fa.gc -p A_J.graphs/ A_J.stats ++ plot-bamstats -r GRCm38_68.19.fa.gc -p NZO.graphs/ NZO.stats ++ head ++ samtools mpileup -f GRCm38_68.19.fa A_J.bam +[mpileup] 1 samples in 1 input files +19 9999902 G 1 ^], D +19 9999903 T 1 , B +19 9999904 G 0 * * +19 9999905 C 2 ,^], =@ +19 9999906 T 2 ,, 5H +19 9999907 G 3 ,,^], EG@ +19 9999908 C 3 ,,, DGB +19 9999909 C 3 ,,, CAE +19 9999910 T 4 ,,,^], @DE6 +19 9999911 G 4 ,,,, FDGE ++ head ++ bcftools mpileup -f GRCm38_68.19.fa A_J.bam +[mpileup] 1 samples in 1 input files +##fileformat=VCFv4.2 +##FILTER= +##bcftoolsVersion=1.10.2+htslib-1.10.2 +##bcftoolsCommand=mpileup -f GRCm38_68.19.fa A_J.bam +##reference=file://GRCm38_68.19.fa +##contig= +##contig= +##contig= +##contig= +##contig= +[mpileup] maximum number of reads per input file set to -d 250 ++ head ++ bcftools call -m ++ bcftools mpileup -f GRCm38_68.19.fa A_J.bam +Note: none of --samples-file, --ploidy or --ploidy-file given, assuming all sites are diploid +[mpileup] 1 samples in 1 input files +##fileformat=VCFv4.2 +##FILTER= +##bcftoolsVersion=1.10.2+htslib-1.10.2 +##bcftoolsCommand=mpileup -f GRCm38_68.19.fa A_J.bam +##reference=file://GRCm38_68.19.fa +##contig= +##contig= +##contig= +##contig= +##contig= +[mpileup] maximum number of reads per input file set to -d 250 ++ bcftools call -mv -o out.vcf ++ bcftools mpileup -a AD -f GRCm38_68.19.fa A_J.bam -Ou +Note: none of --samples-file, --ploidy or --ploidy-file given, assuming all sites are diploid +[mpileup] 1 samples in 1 input files +[mpileup] maximum number of reads per input file set to -d 250 ++ tail out.vcf +19 10995336 . T C 225 . DP=48;VDB=0.232734;SGB=-0.693147;MQSB=0.939086;MQ0F=0;AC=2;AN=2;DP4=0,0,26,20;MQ=59 GT:PL:AD 1/1:255,138,0:0,46 +19 10995346 . C A 225 . DP=50;VDB=0.278885;SGB=-0.693147;MQSB=0.867582;MQ0F=0;AC=2;AN=2;DP4=0,0,25,22;MQ=59 GT:PL:AD 1/1:255,141,0:0,47 +19 10995463 . A C 225 . DP=57;VDB=0.531446;SGB=-0.693147;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,27,28;MQ=60 GT:PL:AD 1/1:255,166,0:0,55 +19 10995508 . T C 225 . DP=51;VDB=0.110544;SGB=-0.693147;MQSB=0.971871;MQ0F=0;AC=2;AN=2;DP4=0,0,24,21;MQ=59 GT:PL:AD 1/1:255,135,0:0,45 +19 10995587 . T C 225 . DP=36;VDB=0.572009;SGB=-0.693139;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,13,23;MQ=60 GT:PL:AD 1/1:255,108,0:0,36 +19 10995605 . A AGCAG 178 . INDEL;IDV=23;IMF=0.741935;DP=31;VDB=0.363991;SGB=-0.691153;MQSB=1;MQ0F=0;ICB=1;HOB=0.5;AC=1;AN=2;DP4=6,7,6,12;MQ=60 GT:PL:AD 0/1:211,0,56:13,18 +19 10995938 . C T 225 . DP=55;VDB=0.800278;SGB=-0.693147;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,31,21;MQ=60 GT:PL:AD 1/1:255,157,0:0,52 +19 10996876 . C T 225 . DP=55;VDB=0.27595;SGB=-0.693147;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,31,23;MQ=60 GT:PL:AD 1/1:255,163,0:0,54 +19 10999808 . C A 225 . DP=58;VDB=0.350828;SGB=-0.693147;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,31,25;MQ=60 GT:PL:AD 1/1:255,169,0:0,56 +19 10999916 . T G 225 . DP=71;VDB=0.969235;SGB=-0.693147;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,35,35;MQ=60 GT:PL:AD 1/1:255,211,0:0,70 ++ head ++ bcftools query --format 'POS=%POS\n' out.vcf +POS=10000104 +POS=10000105 +POS=10000139 +POS=10000141 +POS=10000143 +POS=10001946 +POS=10001994 +POS=10002249 +POS=10002281 +POS=10002676 ++ head ++ bcftools query '-f%POS %REF,%ALT\n' out.vcf +10000104 TTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTC,TTCTCTCTCTCTCTCTCTCTCTC +10000105 T,G +10000139 T,A +10000141 T,A +10000143 T,A +10001946 A,G +10001994 A,G +10002249 C,T +10002281 TTGTATGTATGTATGTATGTATGTATGTATGTATGTAT,TTGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTAT +10002676 C,T ++ head ++ bcftools query '-f%POS %QUAL [%GT %AD] %REF %ALT\n' out.vcf +10000104 222 0/1 26,15 TTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTC TTCTCTCTCTCTCTCTCTCTCTC +10000105 104 1/1 0,14 T G +10000139 3.7766 1/1 0,2 T A +10000141 4.38466 1/1 0,2 T A +10000143 6.51248 1/1 0,2 T A +10001946 225 1/1 0,71 A G +10001994 225 1/1 0,66 A G +10002249 225 1/1 0,72 C T +10002281 222 0/1 50,22 TTGTATGTATGTATGTATGTATGTATGTATGTATGTAT TTGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTAT +10002676 225 1/1 0,34 C T ++ head ++ bcftools query '-f%POS %QUAL [%GT %AD] %REF %ALT\n' '-iQUAL>=30' out.vcf +10000104 222 0/1 26,15 TTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTC TTCTCTCTCTCTCTCTCTCTCTC +10000105 104 1/1 0,14 T G +10001946 225 1/1 0,71 A G +10001994 225 1/1 0,66 A G +10002249 225 1/1 0,72 C T +10002281 222 0/1 50,22 TTGTATGTATGTATGTATGTATGTATGTATGTATGTAT TTGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTAT +10002676 225 1/1 0,34 C T +10002682 225 1/1 0,28 C T +10002723 228 1/1 2,30 GCACACACACACACACAC GCACACACACACAC +10002846 185 0/1 25,9 GGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG GGTGTGTGTGTGTGTG ++ head ++ bcftools query '-f%POS %QUAL [%GT %AD] %REF %ALT\n' '-iQUAL>=30 && type="snp"' out.vcf +10000105 104 1/1 0,14 T G +10001946 225 1/1 0,71 A G +10001994 225 1/1 0,66 A G +10002249 225 1/1 0,72 C T +10002676 225 1/1 0,34 C T +10002682 225 1/1 0,28 C T +10003232 225 1/1 0,50 G A +10003248 225 1/1 0,56 T C +10003440 225 1/1 0,45 A C +10003441 225 1/1 0,44 G A ++ head ++ bcftools stats out.vcf +# This file was produced by bcftools stats (1.10.2+htslib-1.10.2) and can be plotted using plot-vcfstats. +# The command line was: bcftools stats out.vcf +# +# Definition of sets: +# ID [2]id [3]tab-separated file names +ID 0 out.vcf +# SN, Summary numbers: +# number of records .. number of data rows in the VCF +# number of no-ALTs .. reference-only sites, ALT is either "." or identical to REF +# number of SNPs .. number of rows with a SNP ++ grep TSTV ++ bcftools stats out.vcf +# TSTV, transitions/transversions: +# TSTV [2]id [3]ts [4]tv [5]ts/tv [6]ts (1st ALT) [7]tv (1st ALT) [8]ts/tv (1st ALT) +TSTV 0 1583 667 2.37 1583 667 2.37 ++ cut -f5 ++ grep TSTV ++ bcftools stats out.vcf +# TSTV, transitions/transversions: +[5]ts/tv +2.37 ++ bcftools filter -s LowQual '-iQUAL>=30 && AD[*:1]>=25' -g8 -G10 out.vcf -o out.flt.vcf ++ ls A_J.bam NZO.bam +A_J.bam +NZO.bam ++ bcftools call -mv -Ob -o multi.bcf ++ bcftools mpileup -a AD -f GRCm38_68.19.fa A_J.bam NZO.bam -Ou +Note: none of --samples-file, --ploidy or --ploidy-file given, assuming all sites are diploid +[mpileup] 2 samples in 2 input files +[mpileup] maximum number of reads per input file set to -d 250 ++ bcftools index multi.bcf ++ bcftools filter -s LowQual '-iQUAL>=30 && AD[*:1]>=25' -g8 -G10 multi.bcf -Ob -o multi.filt.bcf ++ bcftools index multi.filt.bcf ++ bcftools view -H -r 19:10001946 multi.filt.bcf +19 10001946 . A G 216 PASS DP=144;VDB=0.0854312;SGB=47.6573;RPB=0.820043;MQB=1;MQSB=1;BQB=0.650746;MQ0F=0;ICB=0.5;HOB=0.5;AC=2;AN=4;DP4=33,35,37,34;MQ=60 GT:PL:AD 1/1:255,214,0:0,71 0/0:0,205,255:68,0 ++ bcftools view -H -r 19:10072443 multi.filt.bcf +19 10072430 . ATAAATCTAAATCTAAA ATAAATCTAAA 52 LowQual INDEL;IDV=12;IMF=0.461538;DP=77;VDB=0.90634;SGB=4.00342;MQSB=1;MQ0F=0;ICB=0.3;HOB=0.125;AC=1;AN=4;DP4=36,33,2,6;MQ=60 GT:PL:AD 0/1:86,0,208:18,8 0/0:0,154,255:51,0 +19 10072443 . T A 46.1636 LowQual;SnpGap DP=80;VDB=0.000527421;SGB=2.63044;RPB=0.00100984;MQB=1;MQSB=1;BQB=0.00120386;MQ0F=0;ICB=0.3;HOB=0.125;AC=1;AN=4;DP4=34,22,2,4;MQ=60 GT:PL:AD 0/1:80,0,124:6,6 0/0:0,151,255:50,0 ++ bcftools csq -p m -f GRCm38_68.19.fa -g Mus_musculus.part.gff3.gz -Ob -o multi.filt.annot.bcf ++ bcftools view -i 'FILTER="PASS"' multi.filt.bcf +Parsing Mus_musculus.part.gff3.gz ... +Indexed 98 transcripts, 734 exons, 527 CDSs, 183 UTRs +Calling... ++ bcftools index multi.filt.annot.bcf ++ bcftools query -f %BCSQ -r 19:10088937 multi.filt.annot.bcf +missense|Fads2|ENSMUST00000025567|protein_coding|-|163V>163I|10088937C>T+ head ++ bcftools call -mv ++ bcftools mpileup -f GRCm38_68.19.fa A_J.bam +[mpileup] 1 samples in 1 input files +Note: none of --samples-file, --ploidy or --ploidy-file given, assuming all sites are diploid +##fileformat=VCFv4.2 +##FILTER= +##bcftoolsVersion=1.10.2+htslib-1.10.2 +##bcftoolsCommand=mpileup -f GRCm38_68.19.fa A_J.bam +##reference=file://GRCm38_68.19.fa +##contig= +##contig= +##contig= +##contig= +##contig= +[mpileup] maximum number of reads per input file set to -d 250 ++ head ++ bcftools query '-f%POS %QUAL [%GT %AD] %REF %ALT\n' '-iQUAL>=30 && type="snp" && AD[*:1]>=25' out.vcf +10001946 225 1/1 0,71 A G +10001994 225 1/1 0,66 A G +10002249 225 1/1 0,72 C T +10002676 225 1/1 0,34 C T +10002682 225 1/1 0,28 C T +10003232 225 1/1 0,50 G A +10003248 225 1/1 0,56 T C +10003440 225 1/1 0,45 A C +10003441 225 1/1 0,44 G A +10003529 225 1/1 0,58 T C ++ cut -f5 ++ grep TSTV ++ bcftools stats '-iQUAL>=30 && AD[*:1]>=25' out.vcf +# TSTV, transitions/transversions: +[5]ts/tv +2.50 ++ cut -f5 ++ grep TSTV ++ bcftools stats '-eQUAL>=30 && AD[*:1]>=25' out.vcf +# TSTV, transitions/transversions: +[5]ts/tv +1.75 ++ cut -f5 ++ grep TSTV ++ bcftools stats -i 'GT="het"' out.vcf +# TSTV, transitions/transversions: +[5]ts/tv +1.50 ++ bcftools norm -f GRCm38_68.19.fa out.flt.vcf -o out.flt.norm.vcf +Lines total/split/realigned/skipped: 2907/0/577/0 ++ bcftools call -mv -Ob -o multi.bcf ++ bcftools mpileup -a AD -f GRCm38_68.19.fa A_J.bam NZO.bam -Ou +Note: none of --samples-file, --ploidy or --ploidy-file given, assuming all sites are diploid +[mpileup] 2 samples in 2 input files +[mpileup] maximum number of reads per input file set to -d 250 ++ bcftools index multi.bcf ++ bcftools filter -s LowQual '-iQUAL>=30 && AD[*:1]>=25' -g8 -G10 multi.bcf -Ob -o multi.filt.bcf ++ bcftools index multi.filt.bcf ++ cut -f5 ++ grep TSTV ++ bcftools stats -e 'FILTER="PASS"' multi.filt.bcf +# TSTV, transitions/transversions: +[5]ts/tv +1.73 ++ bcftools view -H -r 19:10001946 multi.filt.bcf +19 10001946 . A G 216 PASS DP=144;VDB=0.0854312;SGB=47.6573;RPB=0.820043;MQB=1;MQSB=1;BQB=0.650746;MQ0F=0;ICB=0.5;HOB=0.5;AC=2;AN=4;DP4=33,35,37,34;MQ=60 GT:PL:AD 1/1:255,214,0:0,71 0/0:0,205,255:68,0 ++ bcftools view -H -r 19:10072443 multi.filt.bcf +19 10072430 . ATAAATCTAAATCTAAA ATAAATCTAAA 52 LowQual INDEL;IDV=12;IMF=0.461538;DP=77;VDB=0.90634;SGB=4.00342;MQSB=1;MQ0F=0;ICB=0.3;HOB=0.125;AC=1;AN=4;DP4=36,33,2,6;MQ=60 GT:PL:AD 0/1:86,0,208:18,8 0/0:0,154,255:51,0 +19 10072443 . T A 46.1636 LowQual;SnpGap DP=80;VDB=0.000527421;SGB=2.63044;RPB=0.00100984;MQB=1;MQSB=1;BQB=0.00120386;MQ0F=0;ICB=0.3;HOB=0.125;AC=1;AN=4;DP4=34,22,2,4;MQ=60 GT:PL:AD 0/1:80,0,124:6,6 0/0:0,151,255:50,0 ++ bcftools query -f %BCSQ -r 19:10088937 multi.filt.annot.bcf +missense|Fads2|ENSMUST00000025567|protein_coding|-|163V>163I|10088937C>T \ No newline at end of file diff --git a/tests/test_variant_calling.sh b/tests/test_variant_calling.sh index 2a12457..8a6ee1a 100644 --- a/tests/test_variant_calling.sh +++ b/tests/test_variant_calling.sh @@ -1,54 +1,72 @@ -##Introduction -cd /home/manager/course_data/variant_calling/data +#!/bin/bash +set -e +set -x + +# ../../variant_calling/practical/Notebooks/index.ipynb +cd ~/course_data/variant_calling/data samtools --help bcftools --help -igv & +#igv.sh + -##Calling variants -pwd -ls -lh +# ../../variant_calling/practical/Notebooks/variant-calling.ipynb +samtools stats -r GRCm38_68.19.fa A_J.bam > A_J.stats +samtools stats -r GRCm38_68.19.fa NZO.bam > NZO.stats +plot-bamstats -s GRCm38_68.19.fa > GRCm38_68.19.fa.gc +plot-bamstats -r GRCm38_68.19.fa.gc -p A_J.graphs/ A_J.stats +plot-bamstats -r GRCm38_68.19.fa.gc -p NZO.graphs/ NZO.stats samtools mpileup -f GRCm38_68.19.fa A_J.bam | head -samtools mpileup -f GRCm38_68.19.fa A_J.bam | grep 10001994 bcftools mpileup -f GRCm38_68.19.fa A_J.bam | head bcftools mpileup -f GRCm38_68.19.fa A_J.bam | bcftools call -m | head -bcftools mpileup -f GRCm38_68.19.fa A_J.bam | bcftools call -mv | head -bcftools mpileup -a ? +#bcftools mpileup -a ? bcftools mpileup -a AD -f GRCm38_68.19.fa A_J.bam -Ou | bcftools call -mv -o out.vcf -grep 10001994 out.vf -grep 10003649 out.vcf +tail out.vcf + -##Filtering Variants +# ../../variant_calling/practical/Notebooks/filtering.ipynb bcftools query --format 'POS=%POS\n' out.vcf | head bcftools query -f'%POS %REF,%ALT\n' out.vcf | head bcftools query -f'%POS %QUAL [%GT %AD] %REF %ALT\n' out.vcf | head bcftools query -f'%POS %QUAL [%GT %AD] %REF %ALT\n' -i'QUAL>=30' out.vcf | head -bcftools query -f'%POS %QUAL [%GT %AD] %REF %ALT\n' -i'QUAL>=30 && type="snp"' out.vcf | head -##Exercises -bcftools query -f'%POS %QUAL [%GT %AD] %REF %ALT\n' -i'QUAL>=30 && type="snp" && AD[*:1]>=25' out.vcf | head +bcftools query -f'%POS %QUAL [%GT %AD] %REF %ALT\n' -i'QUAL>=30 && type="snp"' out.vcf | head +bcftools stats out.vcf | head +bcftools stats out.vcf | grep TSTV bcftools stats out.vcf | grep TSTV | cut -f5 -bcftools stats -i'QUAL>=30 && AD[*:1]>=25' out.vcf | grep TSTV | cut -f5 -bcftools stats -e'QUAL>=30 && AD[*:1]>=25' out.vcf | grep TSTV | cut -f5 -bcftools stats -i 'GT="het"' out.vcf | grep TSTV | cut -f5 bcftools filter -s LowQual -i'QUAL>=30 && AD[*:1]>=25' -g8 -G10 out.vcf -o out.flt.vcf -bcftools norm -f GRCm38_68.19.fa out.flt.vcf -o out.flt.norm.vcf -##Multisample calling + +# ../../variant_calling/practical/Notebooks/multi-sample-calling.ipynb ls *.bam bcftools mpileup -a AD -f GRCm38_68.19.fa *.bam -Ou | bcftools call -mv -Ob -o multi.bcf bcftools index multi.bcf bcftools filter -s LowQual -i'QUAL>=30 && AD[*:1]>=25' -g8 -G10 multi.bcf -Ob -o multi.filt.bcf bcftools index multi.filt.bcf -#Exercises -bcftools stats multi.filt.bcf | grep TSTV | cut -f5 (raw calls) -bcftools stats -i 'FILTER="PASS"' multi.filt.bcf | grep TSTV | cut -f5 (only filtered set) -bcftools stats -e 'FILTER="PASS"' multi.filt.bcf | grep TSTV | cut -f5 -##Visualising variants -##Test manually +# ../../variant_calling/practical/Notebooks/visualisation.ipynb bcftools view -H -r 19:10001946 multi.filt.bcf bcftools view -H -r 19:10072443 multi.filt.bcf -##Annotating variants + +# ../../variant_calling/practical/Notebooks/annotation.ipynb bcftools view -i 'FILTER="PASS"' multi.filt.bcf | bcftools csq -p m -f GRCm38_68.19.fa -g Mus_musculus.part.gff3.gz -Ob -o multi.filt.annot.bcf bcftools index multi.filt.annot.bcf bcftools query -f '%BCSQ' -r 19:10088937 multi.filt.annot.bcf + + +# ../../variant_calling/practical/Notebooks/answers.ipynb +bcftools mpileup -f GRCm38_68.19.fa A_J.bam | bcftools call -mv | head +bcftools query -f'%POS %QUAL [%GT %AD] %REF %ALT\n' -i'QUAL>=30 && type="snp" && AD[*:1]>=25' out.vcf | head +bcftools stats -i'QUAL>=30 && AD[*:1]>=25' out.vcf | grep TSTV | cut -f5 +bcftools stats -e'QUAL>=30 && AD[*:1]>=25' out.vcf | grep TSTV | cut -f5 +bcftools stats -i 'GT="het"' out.vcf | grep TSTV | cut -f5 +bcftools norm -f GRCm38_68.19.fa out.flt.vcf -o out.flt.norm.vcf +bcftools mpileup -a AD -f GRCm38_68.19.fa *.bam -Ou | bcftools call -mv -Ob -o multi.bcf +bcftools index multi.bcf +bcftools filter -s LowQual -i'QUAL>=30 && AD[*:1]>=25' -g8 -G10 multi.bcf -Ob -o multi.filt.bcf +bcftools index multi.filt.bcf +bcftools stats -e 'FILTER="PASS"' multi.filt.bcf | grep TSTV | cut -f5 +bcftools view -H -r 19:10001946 multi.filt.bcf +bcftools view -H -r 19:10072443 multi.filt.bcf +bcftools query -f '%BCSQ' -r 19:10088937 multi.filt.annot.bcf + +