-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfilter_variants.py
136 lines (99 loc) · 5.64 KB
/
filter_variants.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import vcf
# more info: https://github.com/jamescasbon/PyVCF/
import argparse, textwrap, os
# parser = argparse.ArgumentParser(description='Filter variants for: PTMs 95% (LOFTEE LoF=HC), missense CADD PHRED > 20, ExAC MAF NFE < 1%, repeat regions and monomorphic loci.')
parser = argparse.ArgumentParser(
formatter_class = argparse.RawDescriptionHelpFormatter,
description = textwrap.dedent('''\
Write a txt file with the variants filtered by the optional arguments:
- Tier1: PTMs 95% (LOFTEE LoF=HC)
- Tier2: missense CADD PHRED > 20
All variants are filtered by the following QC:
- ExAC MAF NFE < 1%
- removes variants falling in repeat regions (FILTER=repeat_region)
- removes monomorphic loci
'''))
parser.add_argument('input_vcf', help = 'input VCF or bgzipped VCF (can be relative or absolute path)', metavar = '<input_vcf>', nargs = 1)
parser.add_argument('-g', help = 'output variants only in this gene', metavar = 'GENE')
parser.add_argument('-t1', help = 'include Tier1 variants', action = 'store_true')
parser.add_argument('-t2', help = 'include Tier2 variants', action = 'store_true')
parser.add_argument('-vcf', help = 'Additionally, output in VCF format', action = 'store_true')
# parser.add_argument('output_vcf', metavar='o', nargs=1, help='output VCF (full path)')
# parser.add_argument('integers', metavar='N', type=int, nargs='+',
# help='an integer for the accumulator')
# parser.add_argument('--sum', dest='accumulate', action='store_const',
# const=sum, default=max,
# help='sum the integers (default: find the max)')
# print args.accumulate(args.integers)
args = parser.parse_args()
input_vcf = args.input_vcf[0]
t1 = args.t1
t2 = args.t2
g = args.g
output_in_vcf_format = args.vcf
if os.path.dirname(input_vcf) == '':
full_path = "."
else:
full_path = os.path.dirname(input_vcf)
if os.path.basename(input_vcf).endswith('vcf.gz'):
output_vcf = full_path + "/" + os.path.basename(input_vcf)[:-6] + "filtered.vcf"
output_txt = full_path + "/" + os.path.basename(input_vcf)[:-6] + "filtered.txt"
else:
output_vcf = full_path + "/" + os.path.basename(input_vcf)[:-3] + "filtered.vcf"
output_txt = full_path + "/" + os.path.basename(input_vcf)[:-3] + "filtered.txt"
vcf_reader = vcf.Reader(open(input_vcf, 'r'))
output_txt = open(output_txt, 'w')
output_txt.write("CHROM\tPOS\trsID\tREF\tALT\tFILTER\tTYPE\tIMPACT\tGENE\tCLIN_SIG\tLoF\tLoF_info\tExAC_MAF_NFE\tgnomAD_exomes_NFE\tCADD_PHRED\tREVEL\tHET_CARRIERS\tHOM_ALT_CARRIERS\n")
if output_in_vcf_format:
vcf_writer = vcf.Writer(open(output_vcf, 'w'), vcf_reader)
for record in vcf_reader:
if str(record.ALT[0]) == '*':
continue
mut_type = str(record.INFO['CSQ'][0]).split('|')[1]
impact = str(record.INFO['CSQ'][0]).split('|')[2]
gene = str(record.INFO['CSQ'][0]).split('|')[3]
clin_sig = "-" if "CLNSIG" not in record.INFO else str(record.INFO['CLNSIG'][0])
LoF = str(record.INFO['CSQ'][0]).split('|')[27] if str(record.INFO['CSQ'][0]).split('|')[27] != "" else "-"
LoF_info = str(record.INFO['CSQ'][0]).split('|')[30] if str(record.INFO['CSQ'][0]).split('|')[30] != "" else "-"
ExAC_AF_NFE = str(record.INFO['CSQ'][0]).split('|')[40] if str(record.INFO['CSQ'][0]).split('|')[40] != "" else "-"
CADD_PHRED = str(record.INFO['CSQ'][0]).split('|')[44] if str(record.INFO['CSQ'][0]).split('|')[44] != "" else "-"
REVEL = str(record.INFO['CSQ'][0]).split('|')[46] if str(record.INFO['CSQ'][0]).split('|')[46] != "" else "-"
gnomAD_exomes_AF_nfe = str(record.INFO['CSQ'][0]).split('|')[48] if str(record.INFO['CSQ'][0]).split('|')[48] != "" else "-"
# print record.get_hets()
# print record.get_hom_alts()
# print record.samples
# print record.num_het
# print record.num_hom_alt
# print record.num_hom_ref
# print record.is_snp
# print record.is_indel
# print record.is_monomorphic
l_het = ",".join([h.sample for h in record.get_hets()])
if l_het == "": l_het = "-"
l_hom_alts = ",".join([h.sample for h in record.get_hom_alts()])
if l_hom_alts == "": l_hom_alts = "-"
is_Tier1 = False
is_Tier2 = False
if impact == 'HIGH' and LoF == "HC":
is_Tier1 = True
elif impact == 'MODERATE' and mut_type.find('missense') != -1 and (CADD_PHRED == "-" or float(CADD_PHRED) > 20):
is_Tier2 = True
# check monomorphic
n = len(record.samples)
is_monomorphic = (n > 1) and (record.num_het == n or record.num_hom_ref == n or record.num_hom_alt == n)
# check repeated region
FILTER = ",".join(record.FILTER)
pass_filter = (ExAC_AF_NFE == "-" or float(ExAC_AF_NFE) <= 0.01) and FILTER == "" and not is_monomorphic
if not g is None:
pass_filter = pass_filter and (gene == g)
if pass_filter:
if (t1 and is_Tier1) or (t2 and is_Tier2) or (not t1 and not t2):
# print record.CHROM + '\t' + str(record.POS) + '\t' + str(record.ID) + '\t' + record.REF + '\t' + str(record.ALT[0]) + '\tFILTER=' + FILTER + '\tTYPE=' + mut_type + '\tIMPACT=' + impact + '\tGENE=' + gene + '\tCLIN_SIG=' + clin_sig + '\tLoF=' + LoF + '\tLoF_info=' + LoF_info + '\tExAC_MAF=' + ExAC_AF_NFE + '\tgnomAD_exomes_AF_nfe=' + gnomAD_exomes_AF_nfe + '\tCADD_PHRED=' + CADD_PHRED + '\tREVEL=' + REVEL + "\tHET=" + l_het + "\tHOM_ALT=" + l_hom_alts
output_txt.write(record.CHROM + '\t' + str(record.POS) + '\t' + str(record.ID) + '\t' + record.REF + '\t' + str(record.ALT[0]) + '\t' + FILTER + '\t' + mut_type + '\t' + impact + '\t' + gene + '\t' + clin_sig + '\t' + LoF + '\t' + LoF_info + '\t' + ExAC_AF_NFE + '\t' + gnomAD_exomes_AF_nfe + '\t' + CADD_PHRED + '\t' + REVEL + '\t' + l_het + '\t' + l_hom_alts + '\n')
if output_in_vcf_format:
vcf_writer.write_record(record)
if output_in_vcf_format:
vcf_writer.close()
output_txt.close()
# python filter_variants.py -h
# python filter_variants.py chr2.vcf.gz -t1