-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpreprocess.py
268 lines (242 loc) · 11.5 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
import numpy as np
import sys, os
import gzip
import pandas as pd
def merge_rseq_reps(cell_id, id2name, data_path="data/encode/raw_data"):
"""Merge gene expression across replicates give one cell type or tissue id.
Args:
cell_id: ID for a celltype or tissue. E.g., 1
id2name: dic mapping gene id to gene name
data_path: path to data
Return:
A dic map gene name to its TPM value.
"""
num_reps = 0
name2tpm = {}
for exp in os.listdir('%s/%d/rseq' % (data_path, cell_id)):
for rseq_file in os.listdir('%s/%d/rseq/%s' % (data_path, cell_id, exp)):
if rseq_file[-3:] == "tsv":
tsv_file = '%s/%d/rseq/%s/%s' % (data_path, cell_id, exp, rseq_file)
for line in open(tsv_file).readlines()[1:]:
gene_id = line.split('\t')[0].split('.')[0]
if gene_id not in id2name.keys():
continue
tpm_value = float(line.split('\t')[5])
gene_name = id2name[gene_id]
if gene_name in name2tpm.keys():
name2tpm[gene_name].append(tpm_value)
else:
name2tpm[gene_name] = [tpm_value]
num_reps += 1
for item in name2tpm.items():
if len(item[1]) != num_reps:
pass
#print(item[0],len(item[1]))
average_tpm = {item[0]:np.mean(item[1]) for item in name2tpm.items()}
return average_tpm
def get_cell_ids(num_celltypes=39):
"""get valid cell ids by removing cell types with missing data.
Return:
A cell id list.
"""
missing_ids = [8,23,25,30,32,33,34,35,38,39,17]
return [item for item in list(range(1,num_celltypes+1)) if item not in missing_ids]
def get_gene_annot(gene_annot_file='data/encode/gencode.v19.annotation.gtf.gz'):
""" get gene annotation from a annotation file.
Arg:
cell_id: ID for a celltype or tissue. E.g., 1
gene_annot_file: gene annotation file from ENCODE
URL: https://www.encodeproject.org/files/gencode.v19.annotation/@@download/gencode.v19.annotation.gtf.gz
FORMAT: https://www.gencodegenes.org/pages/data_format.html
Returns:
Two dics
First dic mapping gene id (ENSGXXXXXXXXXXX) to gene name;
Second dic mapping gene name to gene location.
"""
id2name = {}
gene2loc = {}
lines = gzip.open(gene_annot_file).readlines()
for line in lines:
if line.decode("utf-8").startswith('#'):
continue
trans_type = line.decode("utf-8").split('\t')[-1].strip().split('; ')[5].split(' ')[1].strip('"').split('.')[0]
gene_type = line.decode("utf-8").split('\t')[-1].strip().split('; ')[2].split(' ')[1].strip('"').split('.')[0]
if trans_type != 'protein_coding' or gene_type != 'protein_coding':
continue
if line.decode("utf-8").split('\t')[2] != 'gene':
continue
chrom = line.decode("utf-8").split('\t')[0]
start = line.decode("utf-8").split('\t')[3]
end = line.decode("utf-8").split('\t')[4]
loc = chrom + ':' + start + '-' + end
gene_id = line.decode("utf-8").split('\t')[-1].strip().split('; ')[0].split(' ')[1].strip('"').split('.')[0]
gene_name = line.decode("utf-8").split('\t')[-1].strip().split('; ')[4].split(' ')[1].strip('"')
id2name[gene_id] = gene_name
gene2loc[gene_name] = loc
#NR2E3 and ZSCAN26 not protein coding, but associated with motif
#We manually add the two genes.
id2name['ENSG00000031544'] = 'NR2E3'
gene2loc['NR2E3'] = 'chr15:72084977-72110600'
id2name['ENSG00000197062'] = 'ZSCAN26'
gene2loc['ZSCAN26'] = 'chr6:28234788-28245974'
return id2name, gene2loc
def get_tf_motif_match():
"""need to be improved, generate a two-column file, motif-tf
Return:
Two items
First item: a list of TF(gene) name (711 TFs).
Second item: dic mapping motif name to tf (gene) name
"""
from scipy.io import loadmat
motif_info = loadmat('data/motif/MotifMatch_human_rmdup.mat')
motif_list = [item.split('\t')[1] for item in open('data/motif/all_motif_rmdup.motif').readlines() \
if item.startswith('>') ]
motif2tf = {}
for each in motif_info['Match2']:
motif = each[0][0]
tf = each[1][0]
if motif not in motif2tf.keys():
motif2tf[motif] = [tf]
else:
motif2tf[motif]+=[tf]
# 1315 out of 1465 motifs could find matched tfs.
motif_list_filtered = [item for item in motif2tf.keys() if item in motif_list]
tf_list = []
for each in motif_list_filtered:
tf_list += motif2tf[each]
tf_list = list(set(tf_list))
tf_list.sort()
isContain = {item : True if item in motif_list_filtered else False for item in motif_list}
return tf_list, {item : motif2tf[item] for item in motif_list_filtered}, isContain
def quantile_norm(df):
"""quantile normalization
Arg:
df: Pandas DataFrame with genes x cells
Return:
normalized Pandas DataFrame
"""
rank_mean = df.stack().groupby(df.rank(method='first').stack().astype(int)).mean()
return df.rank(method='min').stack().astype(int).map(rank_mean).unstack()
def get_sorted_gene(gene2loc):
"""sorted genes according to the genomic coordinate
Return:
A list of sorted genes.
"""
chrom2index = {'chr%d' % item : item for item in range(1,23)}
chrom2index['chrX'] = 23
chrom2index['chrY'] = 24
chrom2index['chrM'] = 25
locs = [[item[0], chrom2index[item[1].split(':')[0]], int(item[1].split(':')[1].split('-')[0]),
int(item[1].split('-')[1])] for item in gene2loc.items()]
locs.sort(key = lambda x : (x[1],x[2]))
return [item[0] for item in locs]
def aggregate_rseq(cell_ids, id2name, gene2loc, tf_list, use_norm = True, save = True):
"""aggregate RNA-seq data across all filtered cell types/tissues.
generate full with gene location, generate only tf, save two panda files.
"""
sorted_genes = get_sorted_gene(gene2loc)
sorted_locs = [gene2loc[item] for item in sorted_genes]
aggregated_tf_expr = np.empty((len(tf_list), len(cell_ids)), dtype = 'float32')
aggregated_full_expr = np.empty((len(sorted_genes), len(cell_ids)), dtype = 'float32')
for cell_id in cell_ids:
average_tpm = merge_rseq_reps(cell_id, id2name)
tf_tpm = [average_tpm[item] for item in tf_list]
full_tpm = [average_tpm[item] for item in sorted_genes]
aggregated_tf_expr[:, cell_ids.index(cell_id)] = tf_tpm
aggregated_full_expr[:, cell_ids.index(cell_id)] = full_tpm
pd_aggregated_tf_expr = pd.DataFrame(data = aggregated_tf_expr,
index = tf_list,
columns = cell_ids)
pd_aggregated_full_expr = pd.DataFrame(data = aggregated_full_expr,
index = [sorted_genes, sorted_locs],
columns = cell_ids)
if use_norm:
pd_aggregated_tf_expr = quantile_norm(pd_aggregated_tf_expr)
pd_aggregated_full_expr = quantile_norm(pd_aggregated_full_expr)
if save:
pd_aggregated_tf_expr.to_csv('data/encode/aggregated_tf_expr.csv', sep = '\t')
pd_aggregated_full_expr.to_csv('data/encode/aggregated_full_expr.csv', sep = '\t')
def get_motif_score(tf_list, motif2tf, isContain, part_id,
region_file = 'data/motif/selected.128k.bin.homer.bed',
save = True):
"""parse the motifscan results by homer tool.
Args:
region_file: path to bed file that was used for motif scanning
motifscan_file: path to the motifscan file from Homer tool
"""
import time
regions = ['_'.join(item.split('\t')[:4]) for item in open(region_file).readlines()]
#13300000 regions (bins), 711 TFs
#motifscore = np.empty((len(regions), len(tf_list)), dtype = 'float32')
print(len(regions),len(tf_list))
def parse_motifscan(part_id):
motifscan_file = 'data/motif/motifscan.128k.p%d.txt' % part_id
motifscore = np.empty((len(regions), len(tf_list)), dtype = 'float32')
print(motifscan_file)
count = 0
f_motifscan = open(motifscan_file,'r')
#skip header
line = f_motifscan.readline()
line = f_motifscan.readline()
while line != '':
if count % 500000 == 0:
print(motifscan_file, count)
motif = line.split('\t')[3]
if isContain[motif]:
region_id = line.split('\t')[0]
score = float(line.split('\t')[-1].rstrip())
for tf in motif2tf[motif]:
previous = motifscore[int(region_id)-1][tf_list.index(tf)]
motifscore[int(region_id)-1][tf_list.index(tf)] = max(score, previous)
line = f_motifscan.readline()
count += 1
f_motifscan.close()
np.save('data/motif/motifscore.p%d.npy' % part_id, motifscore)
#motifscan_file = 'data/motif/motifscan.128k.txt'
parse_motifscan(part_id)
# pd_motifscore = pd.DataFrame(data = motifscore,
# index = regions,
# columns = tf_list)
# if save:
# pd_motifscore.to_csv('data/motif/motifscore.128k.csv', sep = '\t')
# np.save('data/motif/motifscore.npy',motifscore)
def get_targets(cell_ids, bams_file = 'data/encode/bams_list.txt',
region_file = 'data/motif/selected.128k.bin.homer.bed',
save = True):
"""extract the target for geformer (DNase-seq, and 7 ChIP-seq signals)
Args:
bams_file: a text records the bam file path for each cell type/tissue
Return:
A numpy array with shape [num_regions, num_targets] for each cell type/tissue
"""
regions = ['_'.join(item.split('\t')[:4]) for item in open(region_file).readlines()]
targets = ['dseq', 'cseq/CTCF-human', 'cseq/H3K27ac-human', 'cseq/H3K4me3-human', \
'cseq/H3K36me3-human', 'cseq/H3K27me3-human', 'cseq/H3K9me3-human', \
'cseq/H3K4me1-human']
cell2targets = {item : np.empty((len(regions),len(targets))) for item in cell_ids}
targets_data = np.empty((len(cell_ids), len(regions), len(targets)))
for line in open(bams_file).readlines():
cell_id = int(line.split('/')[3])
assert cell_id in cell_ids
target_file = line.split('/')[-1].strip().replace('bam','128k.bed')
target_path = '/'.join(line.split('/')[:-1] + [target_file])
print(target_path)
target = 'dseq' if 'dseq' in line else line.split('/')[4] + '/' + line.split('/')[5]
cell2targets[cell_id][:, targets.index(target)] = np.loadtxt(target_path,
delimiter = '\t', usecols = 3)
for cell_id in cell_ids:
targets_data[cell_ids.index(cell_id), :, :] = cell2targets[cell_id]
if save:
np.save('data/encode/targets_data.npy', targets_data)
for cell_id in cell_ids:
pd_data = pd.DataFrame(data = cell2targets[cell_id],
index = regions,
columns = targets)
pd_data.to_csv('data/encode/target_data/%d.target.csv' % cell_id, sep = '\t')
if __name__ == "__main__":
cell_ids = get_cell_ids()
tf_list, motif2tf, isContain = get_tf_motif_match()
id2name, gene2loc = get_gene_annot()
aggregate_rseq(cell_ids, id2name ,gene2loc, tf_list)
get_targets(cell_ids)
get_motif_score(tf_list, motif2tf, isContain, int(sys.argv[1]))