1
+ """This module contains the SPARCCentralDistributionManager class, which can manage all orientation-based terms of SPARC."""
2
+
3
+ from distributions import *
4
+ from secondary_structure import *
5
+ from memory_profiler import profile
6
+
7
+ sparc_consecutive_mode = 'consec'
8
+ sparc_short_range_mode = 'short_range'
9
+ sparc_long_range_mode = 'long_range'
10
+ sparc_secondary_mode = 'secondary'
11
+ sparc_consec_secondary_mode = 'consec_secondary'
12
+ sparc_default_mode = 'default'
13
+
14
+ class SPARCCentralDistributionManager (FrequencyDistributionManager ):
15
+
16
+ def __init__ (self , frequencies_path , references = None ):
17
+ """frequencies_path should be a path to a directory of alpha zones paired with frequencies for individual amino acid pairs."""
18
+ self .alpha_frequencies = {}
19
+ self .reference_frequencies = None #{}
20
+ self .reference_totals = [[[0 for n in xrange (42 )] for i in xrange (AMINO_ACID_COUNT )] for j in xrange (AMINO_ACID_COUNT )]
21
+ self .total_interactions = [[[0 for n in xrange (42 )] for i in xrange (AMINO_ACID_COUNT )] for j in xrange (AMINO_ACID_COUNT )]
22
+ self .median_frequencies = [[[0 for n in xrange (42 )] for i in xrange (AMINO_ACID_COUNT )] for j in xrange (AMINO_ACID_COUNT )]
23
+ self .total_median = 0
24
+ self .identifier = os .path .basename (frequencies_path )
25
+ if references :
26
+ self .load_references (references )
27
+ self .load_frequencies (frequencies_path )
28
+ self .weight = 1.0
29
+ self .defaultvalue = 0
30
+ self .refstate = True
31
+
32
+ def __repr__ (self ):
33
+ return "<Distribution Manager for '{}' data>" .format (self .identifier )
34
+
35
+ def alpha_frequency (self , aa , aa2 , sec_name ):
36
+ """This helper function retrieves the frequency of the orientation between aa and aa2 in the loaded frequency data. sec_name is the string type for the secondary structure shared by both amino acids, if any."""
37
+ zone = aa .tolocal (aa2 .acarbon ).floor ()
38
+
39
+ alpha_freq = 0
40
+ reference_freq = 0
41
+ if zone in self .alpha_frequencies :
42
+ data_dict = self .alpha_frequencies [zone ][aacode (aa .type )][aacode (aa2 .type )]
43
+ separation = int (min (math .fabs (aa .tag - aa2 .tag ), 6 ) - 1 ) * 7
44
+ if sec_name :
45
+ struct_idx = next ((i for i , ss in enumerate ([None , "helix1" , "helix5" , "helix7" , "sheet0" , "sheet1" , "sheet-1" ]) if ss == sec_name ), 0 )
46
+ separation += struct_idx
47
+ if separation in data_dict :
48
+ alpha_freq = float (data_dict [separation ])
49
+ if zone in self .reference_frequencies :
50
+ data_dict = self .reference_frequencies [zone ][sec_name ]
51
+ separation = int (min (math .fabs (aa .tag - aa2 .tag ), 6 ) - 1 )
52
+ if separation in data_dict :
53
+ reference_freq = float (data_dict [separation ])
54
+ return (alpha_freq , reference_freq )
55
+
56
+ def subscore (self , protein , aa , aa2 , onlyone = False , zero_value = 0.01 ):
57
+ tag1 = aacode (aa .type )
58
+ tag2 = aacode (aa2 .type )
59
+ if tag1 >= AMINO_ACID_COUNT : tag1 = 0
60
+ if tag2 >= AMINO_ACID_COUNT : tag2 = 0
61
+ sec_struct = protein .secondary_structure_aa (aa .tag )
62
+ sec_struct_2 = protein .secondary_structure_aa (aa2 .tag )
63
+ if sec_struct and sec_struct_2 and sec_struct [1 ].start == sec_struct_2 [1 ].start :
64
+ sec_name = sec_struct [0 ].type + str (sec_struct [1 ].identifiers [0 ])
65
+ else :
66
+ sec_name = "default"
67
+
68
+ separation = int (min (math .fabs (aa .tag - aa2 .tag ), 6 ) - 1 ) * 7
69
+ if sec_name :
70
+ struct_idx = next ((i for i , ss in enumerate ([None , "helix1" , "helix5" , "helix7" , "sheet0" , "sheet1" , "sheet-1" ]) if ss == sec_name ), 0 )
71
+ separation += struct_idx
72
+
73
+ if self .refstate :
74
+ subscore , ref = self .alpha_frequency (aa2 , aa , sec_name )
75
+ if subscore == 0 :
76
+ subscore = zero_value
77
+ if ref == 0 :
78
+ ref = zero_value
79
+ subscore2 , ref2 = self .alpha_frequency (aa , aa2 , sec_name )
80
+ if subscore2 == 0 :
81
+ subscore2 = zero_value
82
+ if ref2 == 0 :
83
+ ref2 = zero_value
84
+ if onlyone :
85
+ print subscore , subscore2
86
+ return (subscore * self .weight , subscore2 * self .weight , self .total_interactions [tag1 ][tag2 ], self .total_interactions [tag2 ][tag1 ])
87
+ return - math .log ((subscore / self .total_interactions [tag1 ][tag2 ][separation ]) / (ref / self .reference_totals [sec_name ][int (min (math .fabs (aa .tag - aa2 .tag ), 6 ) - 1 )])) - math .log ((subscore2 / self .total_interactions [tag2 ][tag1 ][separation ]) / (ref2 / self .reference_totals [sec_name ][int (min (math .fabs (aa .tag - aa2 .tag ), 6 ) - 1 )]))
88
+ '''else:
89
+ zone = aa2.tolocal(aa.acarbon).floor()
90
+ subscore = self.alpha_frequency(tag2, tag1, zone)
91
+ if subscore == 0:
92
+ subscore = zero_value
93
+ subscore = -math.log(subscore / self.median_frequencies[tag2][tag1] * self.total_interactions[tag2][tag1] / self.total_median)
94
+ zone2 = aa.tolocal(aa2.acarbon).floor()
95
+ subscore2 = self.alpha_frequency(tag1, tag2, zone2)
96
+ if subscore2 == 0:
97
+ subscore2 = zero_value
98
+ subscore2 = -math.log(subscore2 / self.median_frequencies[tag1][tag2] * self.total_interactions[tag1][tag2] / self.total_median)
99
+ if onlyone:
100
+ print subscore, subscore2
101
+ return (subscore * self.weight, subscore2 * self.weight, self.total_interactions[tag1][tag2], self.total_interactions[tag2][tag1])
102
+ return subscore + subscore2'''
103
+
104
+ def score (self , protein , data , system = None , isolate = False , onlyone = False , prior = 2 , zero_value = 0.01 , mode = 'default' ):
105
+ """For frequency distributions, pass in an array of hypothetical aminoacids. This implementation returns the product of the frequencies of each pairwise interaction. If isolate=True, only the amino acids in data will be considered for the energy calculation.
106
+ Pass prior to consider ONLY the amino acid before (True) or after (False) each amino acid in data. This works best for consecutive modes."""
107
+ score = 0.0
108
+ taglist = {}
109
+
110
+ consec = 2
111
+ use_secondary = 2
112
+ use_short_range = 2
113
+ if mode == sparc_consecutive_mode :
114
+ consec = 1
115
+ use_secondary = 0
116
+ elif mode == sparc_secondary_mode :
117
+ consec = 1
118
+ use_secondary = 1
119
+ elif mode == sparc_consec_secondary_mode :
120
+ consec = 1
121
+ elif mode == sparc_short_range_mode :
122
+ consec = 0
123
+ use_short_range = 1
124
+ elif mode == sparc_long_range_mode :
125
+ consec = 0
126
+ use_short_range = 0
127
+
128
+ for aa in data :
129
+ if not aa : continue
130
+ if prior != 2 :
131
+ nearby = []
132
+ if aa .tag > 0 and prior != False : nearby .append (protein .aminoacids [aa .tag - 1 ])
133
+ if aa .tag < len (protein .aminoacids ) - 1 and prior != True : nearby .append (protein .aminoacids [aa .tag + 1 ])
134
+ else :
135
+ if system and not consec and use_short_range != 0 :
136
+ nearby = system .nearby_aa (aa , protein , 10.0 , consec = consec )
137
+ else :
138
+ nearby = protein .nearby_aa (aa , 10.0 , consec = consec )
139
+ for aa2 in nearby :
140
+ if not aa2 : continue
141
+ sec_struct = protein .secondary_structure_aa (aa .tag )
142
+ sec_struct_2 = protein .secondary_structure_aa (aa2 .tag )
143
+ if aa2 .tag - aa .tag == 1 and aa .has_break : continue
144
+ elif math .fabs (aa2 .tag - aa .tag ) > 5 and use_short_range == 1 : continue
145
+ elif math .fabs (aa2 .tag - aa .tag ) <= 5 and use_short_range == 0 : continue
146
+ elif use_secondary == 0 and sec_struct and sec_struct_2 and sec_struct [1 ].start == sec_struct_2 [1 ].start : continue
147
+ elif use_secondary == 1 and not (sec_struct and sec_struct_2 and sec_struct [1 ].start == sec_struct_2 [1 ].start ): continue
148
+ if (aa .tag in taglist and aa2 .tag in taglist [aa .tag ]) or (aa2 .tag in taglist and aa .tag in taglist [aa2 .tag ]):
149
+ continue
150
+ hypo = next ((x for x in data if x and x .tag == aa2 .tag ), None )
151
+ if hypo is not None : aa2 = hypo
152
+ elif isolate : continue
153
+
154
+ try :
155
+ subscore = self .subscore (protein , aa , aa2 , onlyone , zero_value )
156
+ except ZeroDivisionError :
157
+ subscore = 0.0
158
+
159
+ if onlyone : return subscore
160
+ else : score += subscore
161
+
162
+ if aa .tag in taglist :
163
+ taglist [aa .tag ].append (aa2 .tag )
164
+ else :
165
+ taglist [aa .tag ] = [aa2 .tag ]
166
+ return score * self .weight
167
+
168
+ def read_frequency_line (self , line , tag1 , tag2 ):
169
+ """Helper method for load_frequencies, intended for subclasses to easily modify the reading procedure."""
170
+ ptcomps , freqs = line .strip ().split (";" )
171
+ alpha = Point3D (* ptcomps .split ("," ))
172
+ if alpha not in self .alpha_frequencies :
173
+ self .alpha_frequencies [alpha ] = [[{} for i in xrange (AMINO_ACID_COUNT )] for k in xrange (AMINO_ACID_COUNT )]
174
+ freqs = [freq for freq in freqs .split ("," ) if len (freq )]
175
+ for sep , freq in enumerate (freqs ):
176
+ freq = int (freq )
177
+ if freq != 0 :
178
+ self .alpha_frequencies [alpha ][tag1 ][tag2 ][sep ] = freq
179
+ self .total_interactions [tag1 ][tag2 ][sep ] += freq
180
+
181
+ def load_frequencies (self , path ):
182
+ files = os .listdir (path )
183
+ self .total_interactions = [[[0 for k in xrange (42 )] for i in xrange (AMINO_ACID_COUNT )] for j in xrange (AMINO_ACID_COUNT )]
184
+ loading_indicator .add_loading_data (len (files ))
185
+ for n , indfile in enumerate (files ):
186
+ loading_indicator .update_progress (1 )
187
+ if indfile .find (".txt" ) == - 1 or indfile [0 ] == "." : continue
188
+ tag1 , tag2 = indfile [0 :- 4 ].split ('-' )
189
+ tag1 = int (tag1 )
190
+ tag2 = int (tag2 )
191
+ with open (join (path , indfile ), 'r' ) as file :
192
+ for line in file :
193
+ if ";" not in line :
194
+ if len (line .strip ()) > 0 :
195
+ self .median_frequencies [tag1 ][tag2 ] = [float (y ) for y in line .split ("," )]
196
+ continue
197
+ self .read_frequency_line (line , tag1 , tag2 )
198
+ #Compute median total frequency
199
+ #s = sorted([x for list1 in self.total_interactions for x in list1])
200
+ #self.total_median = sum(s) / float(len(s)) #s[int(len(s) / 2.0)]
201
+
202
+ def load_references (self , path ):
203
+ files = os .listdir (path )
204
+ percentage = 0
205
+ self .reference_frequencies = {}
206
+ self .reference_totals = secondary_structures_dict ([0 for i in xrange (7 )])
207
+ self .reference_totals ["default" ] = [0 for i in xrange (7 )]
208
+ self .reference_totals ["all" ] = [0 for i in xrange (7 )]
209
+ loading_indicator .add_loading_data (len (files ))
210
+ for n , indfile in enumerate (files ):
211
+ loading_indicator .update_progress (1 )
212
+ if indfile .find (".txt" ) == - 1 : continue
213
+ sec_name = indfile [0 :- 4 ]
214
+ with open (join (path , indfile ), 'r' ) as file :
215
+ for line in file :
216
+ if ";" not in line or len (line .strip ()) == 0 :
217
+ continue
218
+ ptcomps , freqs = line .strip ().split (";" )
219
+ alpha = Point3D (* ptcomps .split ("," ))
220
+ if alpha not in self .reference_frequencies :
221
+ self .reference_frequencies [alpha ] = secondary_structures_dict ()
222
+ self .reference_frequencies [alpha ]["default" ] = {}
223
+ self .reference_frequencies [alpha ]["all" ] = {}
224
+ freqs = [freq for freq in freqs .split ("," ) if len (freq )]
225
+ for sep , freq in enumerate (freqs ):
226
+ if int (freq ) != 0.0 :
227
+ self .reference_frequencies [alpha ][sec_name ][sep ] = int (freq )
228
+ self .reference_totals [sec_name ][sep ] += int (freq )
229
+ print "Loaded references"
230
+
231
+ class SPARCCentralDistributionPuppet (object ):
232
+ """The SPARCCentralDistributionPuppet class provides objects that can act like individual frequency managers, while all the time referring back to a single centralized distribution manager. Simply initialize the object with a manager object and a mode (see the top of the module), then use it by calling the score() method as you would any FrequencyDistributionManager."""
233
+
234
+ def __init__ (self , manager , mode = 'default' , weight = 1.0 ):
235
+ self .manager = manager
236
+ self .mode = mode
237
+ self .weight = weight
238
+ self .identifier = self .mode
239
+ self .short_range = 2
240
+ self .blocks_secondary_structures = 2
241
+ if self .mode == sparc_consecutive_mode :
242
+ self .type = frequency_consec_disttype
243
+ self .blocks_secondary_structures = 1
244
+ elif self .mode == sparc_secondary_mode or self .mode == sparc_consec_secondary_mode :
245
+ self .type = frequency_consec_disttype
246
+ self .blocks_secondary_structures = 0
247
+ elif self .mode == sparc_long_range_mode :
248
+ self .type = frequency_nonconsec_disttype
249
+ self .short_range = 0
250
+ elif self .mode == sparc_short_range_mode :
251
+ self .type = frequency_nonconsec_disttype
252
+ self .short_range = 1
253
+
254
+ def score (self , protein , data , system = None , isolate = False , onlyone = False , prior = 2 , zero_value = 0.01 ):
255
+ """This method funnels through to the puppet's original central manager, passing in the mode parameter."""
256
+ return self .manager .score (protein , data , system , isolate , onlyone , prior , zero_value , self .mode ) * self .weight
0 commit comments