-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclinvar_vcf_to_gene_disease_table.py
executable file
·84 lines (75 loc) · 2.51 KB
/
clinvar_vcf_to_gene_disease_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/local/Anaconda/envs/py3.5/bin/python
import sys
import gzip
import subprocess
vcf = gzip.open(sys.argv[1],'rt')
def gtf_reader(gtf):
gtf_bed = {}
for line in gtf:
if line[0] == '#':
continue
line = line.split('\t')
info = line[8]
chr, start, end = line[0], int(line[3]), int(line[4])
try:
gene = [g.split('"')[1].upper() for g in info.split(';') if 'gene_name' in g][0]
except:
pass
if gene not in gtf_bed:
gtf_bed[gene] = chr, start, end
else:
old_start, old_end = gtf_bed[gene][1:3]
new_start = min(old_start, start)
new_end = max(old_end, end)
gtf_bed[gene] = chr, new_start, new_end
return(gtf_bed)
# hand corrections
gene_corr = {'CYTB': 'MT-CYB','COX1': 'PTGS1', 'COX2': 'PTGS2', 'ZAK' : 'AC013461.1', 'ZFP112': 'ZNF112'}
# roll through vcf and grab gene and collapse diasease by variant to the the gene level
gene_disease = {}
for line in vcf:
if line[0] == '#':
continue
info = line.split('\t')[7]
try:
gene = [gi.split('=')[1].split(':')[0] for gi in info.split(';') if 'GENEINFO' in gi][0]
disease = [d.split('=')[1] for d in info.split(';') if 'CLNDBN' in d][0]
except:
sys.stderr.write('Gene or disease missing in line: ' + info)
if gene in gene_corr:
gene = gene_corr[gene]
if gene not in gene_disease:
gene_disease[gene] = disease
else:
existing_disease = gene_disease[gene]
updated_disease = existing_disease + '|' + disease
disease_set = list(set(updated_disease.split('|')))
disease_set = [x for x in disease_set if x not in ['.','not_provided', 'not_specified']]
disease_set = '|'.join(disease_set)
gene_disease[gene] = disease_set
# pull in gtf to create bed
# one line for each gene, using min and max start/stop coordinates
gtf_bed = {}
gtf = gzip.open(sys.argv[2],'rt')
gtf_bed = gtf_reader(gtf)
gtf_alt = gzip.open(sys.argv[3],'rt')
gtf_bed_alt = gtf_reader(gtf_alt)
# match up gene_disease against gene_coordinates
for key in gene_disease:
u_key = key.upper()
if u_key in gtf_bed:
start= str(gtf_bed[u_key][1])
end = str(gtf_bed[u_key][2])
print(gtf_bed[u_key][0] + '\t' + start + '\t' + end + '\t' + gene_disease[key])
elif u_key in gtf_bed_alt:
start= str(gtf_bed_alt[u_key][1])
end = str(gtf_bed_alt[u_key][2])
print(gtf_bed_alt[u_key][0] + '\t' + start + '\t' + end + '\t' + gene_disease[key])
else:
u_key = 'MT-' + u_key
try:
start= str(gtf_bed[u_key][1])
end = str(gtf_bed[u_key][2])
print(gtf_bed[u_key][0] + '\t' + start + '\t' + end + '\t' + gene_disease[key])
except:
sys.stderr.write(key + ' not in GTFs\n')