-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmap_annotation_via_orthologs_gh.py
82 lines (65 loc) · 2.15 KB
/
map_annotation_via_orthologs_gh.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
### Boas Pucker ###
### v0.3 ###
### [email protected] ###
#updated with Araport11 annotation (March 2017)
__usage__ = """"
python map_annotation_via_orthologs.py\n
--in <INPUT_FILE>
--out <OUTPUT_FILE>
bug reports and feature requests: [email protected]
"""
import re, sys
# --- end of imports --- #
def load_annotation( annotation_file ):
"""! @brief load all content from annotation file """
mapping_table = {}
with open( annotation_file, "r" ) as f:
line = f.readline()
while line:
parts = line.strip().split('\t')
if len( parts ) > 1:
if parts[2] == "":
annotation = "N/A"
else:
annotation = parts[1] + "." + parts[2]
mapping_table.update( { parts[0].upper(): annotation } )
line = f.readline()
return mapping_table
def annotate_genes( mapping_table, data_input_file, output_file ):
"""! @brief reads data from input file and maps annotation to geneID; results are written into output file """
counter = 0
with open( output_file, "w" ) as out:
with open( data_input_file, "r" ) as f:
f.readline() #header
line = f.readline()
while line:
parts = line.strip().split('\t')
AGIs = re.findall( "AT[12345CM]{1}G\d{5}", line.upper() )
if ',' in parts[1]:
genes = parts[1].split(', ')
else:
genes = [ parts[1] ]
anno = []
for AGI in AGIs:
try:
anno.append( mapping_table[ AGI ] )
counter += len( genes )
except KeyError:
anno.append( AGI + ". n/a" )
anno = "_%%%_".join( anno )
for gene in genes:
new_line = [ gene, anno ]
out.write( '\t'.join( new_line ) + '\n' )
line = f.readline()
print "number of annotated genes: " + str( counter )
def main( arguments ):
""""! @brief runs all functions for mapping of annotation """
annotation_file = "araport11_annotation.txt"
mapping_table = load_annotation( annotation_file )
data_input_file = arguments[ arguments.index( '--in' ) + 1 ]
output_file = arguments[ arguments.index( '--out' ) + 1 ]
annotate_genes( mapping_table, data_input_file, output_file )
if '--in' in sys.argv and '--out' in sys.argv:
main( sys.argv )
else:
sys.exit( __usage__ )