-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathassoc2qqman.py
executable file
·80 lines (63 loc) · 2.36 KB
/
assoc2qqman.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/python
import sys
import os
import getopt
import re
'''
AUTHOR: Johan Zicola
USAGE: assoc2qqman.py <prefix.assoc.txt>
Read the output of gemma 'prefix.assoc.txt' derived
from the association analysis command and performs
readjustment of the file to be compatible for Read
R analysis with the package 'qqman'. Note that chromosomes
should named as digits.
Compatible Python27 and Python37
'''
# Function to test whether the string following "Chr"
# in the chromosome name is a digit or not. This allow
# to discard lines with ChrM or ChrC for instance
def is_digit(x):
try:
int(x)
return True
except ValueError:
return False
def main():
# Open the file
input_file = sys.argv[1]
# Check if input file exists
if os.path.isfile(input_file):
input_file = open(input_file, "r")
lines_file = input_file.read().splitlines()
else:
sys.exit("Error: input file does not exist")
# Get through the file and parse the fiels to reorganize them as expected
# by manahattan R package
for line in lines_file:
if line[:3] == "chr": #First line should be header (chr per default in gemma output)
header = "SNP\tCHR\tBP\tP\tzscore"
print(header)
else:
line = line.strip().split("\t") # Get rid of EOL and Create a list based on \t separation
# Get chromosome name (should be a digit) and the name of the SNP
# on the chromosome
CHR = line[0]
SNP = line[1]
# If CHR=0, it means the chromosome names in the VCF file are not pure digit
# then, retrieve chromosome name from the SNP field and remove all non digits
if int(CHR) == 0:
CHR = re.split(':', SNP)[0]
CHR = re.sub("[^0-9]", "", CHR)
# Skip the SNP if the CHR is not a digit)
if not is_digit(CHR):
continue
# Get the information for the other fields
# (note that it works when GEMMA option -lmm 2 is set)
BP = line[2]
P = line[8]
zscore = line[7]
new_line = SNP, CHR, BP, P, zscore
print("\t".join(new_line))
input_file.close()
if __name__ == "__main__":
sys.exit(main())