-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprepareGeneListsWholeGenome.py
75 lines (50 loc) · 1.75 KB
/
prepareGeneListsWholeGenome.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#######################################
## ##
## prepareGeneListsWholeGenome.py ##
## ##
#######################################
#
# Purpose: preparing lists of genes covering the whole genome
#
#
# Output:
# - several lists, with at most 20 genes in each
#
from time import localtime, strftime
import re
import ast
dir = "./mm10_input/"
refGene = "refGene.txt"
pattern = r"\d+\t(N[MR]_\d+)\t(chr[MXY\d]+)\t([\+-])\t\d+\t\d+\t(\d+)\t(\d+)\t\d+\t([\d,]+)\t([\d,]+)\t0\t(.+)\t(cmpl|unk|incmpl)\t(cmpl|unk|incmpl)"
max_number_of_genes = 25
print strftime("%H:%M:%S", localtime())+": Preparing lists covering the whole genome."
# load all genes from reference genome
print strftime("%H:%M:%S", localtime())+":\tLoading genes from reference genome."
refSeqGenes = []
inFile = open(dir+refGene,'r')
for line in inFile:
match = re.search(pattern,line.rstrip())
if match:
geneName = match.group(8)
if geneName not in refSeqGenes:
refSeqGenes.append(geneName)
inFile.close()
print strftime("%H:%M:%S", localtime())+":\t"+str(len(refSeqGenes))+" are yet to be listed."
# Finally, we create the lists
print strftime("%H:%M:%S", localtime())+":\tSaving new lists."
list_index = 0
for i in range(0,len(refSeqGenes)):
if(i%max_number_of_genes==0):
list_index += 1
filename = "list_%03d.txt" % list_index
if i>0:
outFile.close()
outFile = open(dir+filename,'w')
outFile.write(refSeqGenes[i]+"\n")
outFile.close()
print strftime("%H:%M:%S", localtime())+": Done. "+str(list_index)+" lists created."
#######################
## ##
## End of File ##
## ##
#######################