-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathosteo_bagging.py
183 lines (146 loc) · 6.72 KB
/
osteo_bagging.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#!/usr/bin/python
# use Python-3.4
# python /home/unix/wcchou/gsapWenChi/gautvik/bin/gwas_bagging/GWAS_enrichment.arg.py ../bagging/bagSNP.chr1.print.txt ../../data/gwas.2.5m/all.pvalue/HEIGHT_hg19_Pvalue 1
# headers
import sys
import pandas as pd
import numpy as np
import re
import os
import gzip
from pandas import DataFrame
## Problem shooting -- p value need to match bags, generate p value file
#read in the SNP bag file
# wcc arg3 = chromosome number
# wcc arg1
#bagSNP_file = open("C:\\Users\\User\\Desktop\\eQTL_project\\bagging\\finished.Bagging.NoMissing\\bagSNP.chr1.print.txt", 'r').read()
#sys.argv[1] = "/home/simon/eQTL_project/GTeX_enrichment/eQTL_finshed_bagging/bagSNP.chr1.print.txt"
inputeQTLLD_filePath = sys.argv[1]
#sys.argv[2] = "/home/simon/eQTL_project/GTeX_enrichment/Muscle_Skeletal_Analysis_chrpos_pval.txt"
inputGWAS_filePath = sys.argv[2]
#sys.argv[3] = '1'
chromosome = sys.argv[3]
outputDirName = sys.argv[4]
bagSNP_file = open(inputeQTLLD_filePath, 'r').read()
#GWAS_file = gzip.open(inputGWAS_filePath, 'r')
print ("working on chromosome " + chromosome)
print ("input eQTL LD file: " + inputeQTLLD_filePath)
print ("input GTeX file: " + inputGWAS_filePath)
# mkdir a output dir
#outputDirName = "/restricted/projectnb/chengcancer/Simon/project/T2D/bulk_RNA-seq/merge_170415_NB501164_170506_NB501164/2mapping/QT/outdir"
#This original code would occur race conditional error sometimes
if not os.path.exists(outputDirName):
os.makedirs(outputDirName)
#while True:
# try:
# os.makedirs(outputDirName)
# break
# except OSError, e:
# if e.errno != 17:
# raise
# time.sleep might help here
# pass
print ("output Folder is at " + outputDirName)
outputBagSNPFilePath=outputDirName + '/GTeXbag.chr' + sys.argv[3] + '_bagSNP'
outputBagPvalueFilePath=outputDirName + '/GTeXbag.chr' + sys.argv[3] + '_bagPvalue'
print ("output bagSNP file is at " + outputBagSNPFilePath)
print ("output bagPvalue file is at " + outputBagPvalueFilePath)
# split the strings of the bagSNP by spaces and newlines
print ("Loading eQTL bag ...")
bagSNP = []
bagidx = []
bag_number = 1
for line in range(len(bagSNP_file.splitlines())):
temp = (bagSNP_file.splitlines()[line])
words = temp.split()
idx = 0
for idx in range(len(words)):
bagSNP.append(words[idx])
bagidx.append(bag_number)
#print (bag_number)
bag_number += 1
print ("There are ", bag_number, " LD bags in the input eQTL LD file.")
# For chr1, bqg_number should be 93418
# Create a Pandas dataframe table
# this table have each row of bagSNPs and index as their bag numbers
bag_df = pd.DataFrame({"SNPs" : bagSNP, "bag index" : bagidx})
#wcc020417 bag_df.sort_values(by="SNPs")
#bag_df
#import GWAS file
# wcc arg2
#GWAS_file = open("C:\\Users\\User\\Desktop\\eQTL_project\\python_data\\HEIGHT_hg19_Pvalue", 'r')
# read the GWAS line by line
GWAS_SNP = []
GWAS_Pval = []
count = 1
gwasFileLineCount = 0
#with open(inputGWAS_filePath) as GWAS_file:
with gzip.open(inputGWAS_filePath, 'rt') as GWAS_file:
for line in GWAS_file:
line = line.strip()
gwasFileLineCount += 1
p = line.split()
# wcc 1=arg3=chr## Need to change the number if we bag GWAS with different chromosome !!!
#if(p[0].split(":")[0] == '1'):
if (p[0].split(":")[0] == chromosome):
GWAS_SNP.append(p[0].split(":")[1])
GWAS_Pval.append(p[1])
count += 1
print ("There ", gwasFileLineCount, " SNPs in the input GWAS file.")
print ("There ", count, " GTeX SNPs in selected chromosome.")
# for HEIGHT GWAS, there should be 2481248 lines, for chromosome 1, there should be 188142 lines
# already check that it doesn't contain chr 1X in the list p
# Create a Pandas dataframe table
# this table have each row of GWASSNPs and index as their p values
GWAS_df = pd.DataFrame({"SNPs" : GWAS_SNP, "P Value" : GWAS_Pval})
#wcc020417 GWAS_df = GWAS_df.sort_values(by="SNPs")
## create a dataframe for storing the mapped results
# Create a Pandas dataframe table
# this table have each row of SNPs and index as their bag index
GWASbag_df = pd.merge(bag_df, GWAS_df, on="SNPs") # 161068 rows for chr1 and HEIGHT
#GWASbag_df = GWASbag_df[~GWASbag_df["P Value"].str.contains("NA")] #160951
#GWASbag_df = GWASbag_df[~GWASbag_df["P Value"].str.contains("NaN")]
GWASbag_df.dropna() ## Need to reset the bag index
#wcc020417 GWASbag_df = GWASbag_df.sort_values(by = "bag index")
GWASbag_df = GWASbag_df.reset_index(drop=True) # reset the index of rows, not index of bags
# Create a dataframe table for storing (bag index - representative p value) information
GWAS_bag_amounts = len(GWASbag_df["bag index"].unique()) # 44708 bags for chr1 and HEIGHT
Mapped_GWAS_SNP_amounts = len(GWASbag_df["SNPs"])
Missed_GWAS_SNP_amounts = len(GWAS_df["SNPs"]) - Mapped_GWAS_SNP_amounts
bag_number # original eQTL bag numbers
# Solve the jump number of bag index problem
newbagidx = []
idx = 1
newbagidx.append(idx)
for j in range (1, len(GWASbag_df)): # len(GWASbag_df) = 161068 for HEIGHT chr1 ## 1~161067
if (GWASbag_df["bag index"] [j] != GWASbag_df["bag index"] [j - 1]):
idx += 1
newbagidx.append(idx)
else:
newbagidx.append(idx)
GWASbag_df = pd.DataFrame({"SNPs" : GWASbag_df["SNPs"], "P Value" : GWASbag_df["P Value"], "bag index" : newbagidx})
# for each bag index in merged GWASbag_df , find the smallest p value, assign to the new GWAS_bag_result_df dataframe
GWASbagidx = []
smallestPval = []
for idx in range(GWAS_bag_amounts):
df = (GWASbag_df[GWASbag_df["bag index"] == idx + 1])
smallestPval.append(df["P Value"].min())
GWASbagidx.append(idx + 1)
GWAS_bag_result_df = pd.DataFrame({"bag index": GWASbagidx, "P Value": smallestPval})
GWAS_bag_result_df
# space delimited output
# space delimited output
# wcc arg4 pvalue outputfile
#GWAS_bag_result_df['P Value'].to_csv('C:\\Users\\User\\Desktop\\eQTL_project\\python_data\\%s_bagPval_Chr1.txt'%"HEIGHT_hg19" , header=None, index=None, mode='w')
GWAS_bag_result_df['P Value'].to_csv(outputBagPvalueFilePath, header=None, index=None, sep=' ', mode='w')
# wcc arg5 SNP outputfile"
#outFile = open('C:\\Users\\User\\Desktop\\eQTL_project\\python_data\\%s_bagSNP_Chr1.txt'% "HEIGHT_hg19", 'w')
outFile = open(outputBagSNPFilePath, 'w')
for bagidx in range(GWAS_bag_amounts):
test = GWASbag_df[GWASbag_df["bag index"] == (bagidx + 1)].sort_values(by = "P Value")
a = test.groupby("bag index")["SNPs"].apply(tuple)
for result in a:
result = ' '.join(result)
outFile.write(result + '\n')
GWAS_file.close()
outFile.close()