-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdistributions.py
321 lines (295 loc) · 12.8 KB
/
distributions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
from aminoacids import *
from os.path import join
import os
from math import floor, fabs
import datetime
import loading_indicator
frequency_disttype = 0
frequency_nonconsec_disttype = 1
frequency_consec_disttype = 2
medium_disttype = 3
other_disttype = 4
class DistributionManager(object):
'''Abstract base class whose concrete instances are passed to the folding simulator. Instances of DistributionManager's subclasses should provide a score for a particular amino acid positional relationship. Instances may be passed a list of amino acids, in which the first amino acid should be considered the subject of scoring.
Only interactions in which the second amino acid's tag is greater than the subject amino acid should be scored (preventing scores from being doubled).'''
def __init__(self):
self.type = other_disttype
self.weights = []
def score(self, protein, data):
"""score() must be implemented by all instances, and is used to determine the feasibility of a protein conformation."""
raise NotImplementedError("Instances of DistributionManager must implement score().")
#MARK: - Frequency
class FrequencyDistributionManager(DistributionManager):
'''Semi-abstract subclass of DistributionManager that provides file-handling methods that parse the standard amino acid frequency rankings. Can be used for nonconsecutive and consecutive distribution data.'''
def __init__(self, rankings_path, frequencies_path, axisradii_path=None):
"""FrequencyDistributionManager maintains a series of datasets for retrieving the frequencies of various situations, including alpha carbon zones and axis configurations.
++ rankings_path should provide the path to a directory of top-ranked alpha zones and their most common axis configurations.
++ frequencies_path should be a path to a directory of alpha zones paired with frequencies for individual amino acid pairs (like rankings_path, except with all zones represented). Provide a file named "all.txt" inside the directory to define zone frequencies to be used for all amino acid pairs.
++ axisradii_path should be a path to a directory with two files, x.txt and y.txt. These two files should contain regression information for predicting the frequency of an axis being found at a certain distance from the reference amino acid: zone.x, zone.y, zone.z; vertex.x of quadratic regression; vertex.y; leading coefficient."""
super(FrequencyDistributionManager,self).__init__()
self.default_alpha_frequencies = {}
self.alpha_frequencies = {}
self.axis_regressions = {} #Format - key: alpha zone, value: [h1, k1, a1, h2, k2, a2] for quadratic model with 1 = x-axis, 2 = y-axis
self.axis_frequencies = [[{} for i in xrange(AMINO_ACID_COUNT)] for i in xrange(AMINO_ACID_COUNT)]
self.median_frequencies = [[0 for i in xrange(AMINO_ACID_COUNT)] for i in xrange(AMINO_ACID_COUNT)]
print "Loading..."
a = datetime.datetime.now()
self.load_rankings(rankings_path)
self.load_frequencies(frequencies_path)
self.load_axis_regressions(axisradii_path)
b = datetime.datetime.now()
print "Loaded in {1:.4f} sec.".format(self, (b - a).total_seconds())
self.weight = 1.0
self.type = frequency_disttype
self.defaultvalue = 0
self.refstate = False
def alpha_frequency(self, type1, type2, zone):
"""This helper function retrieves the frequency of 'zone' in the loaded frequency data. type1 refers to the type (code) of the amino acid that serves as the origin of the zone space, while type2 is the amino acid to which the zone refers."""
if zone in self.default_alpha_frequencies:
return self.default_alpha_frequencies[zone]
elif zone in self.alpha_frequencies:
return self.alpha_frequencies[zone][type1][type2]
else:
return 0
def score(self, protein, data, system=None):
"""For frequency distributions, pass in an array of hypothetical aminoacids."""
score = 0.0
consec = 2
if self.type == frequency_consec_disttype: consec = 1
elif self.type == frequency_nonconsec_disttype: consec = 0
for aa in data:
if not aa: continue
if system:
nearby = system.nearby_aa(aa, protein, 10.0, consec=consec)
else:
nearby = protein.nearby_aa(aa, 10.0, consec=consec)
for aa2 in nearby:
hypo = next((x for x in data if x and x.tag == aa2.tag), None)
if hypo is not None: aa2 = hypo
tag1 = aacode(aa.type)
tag2 = aacode(aa2.type)
if tag1 >= AMINO_ACID_COUNT: tag1 = 0
if tag2 >= AMINO_ACID_COUNT: tag2 = 0
# Determine the score based on the reference amino acid's perspectives and its neighbors'.
# Formula: S = -ln(F/F0) if F > 0, +5 otherwise
zone = aa2.tolocal(aa.acarbon).floor()
freq = self.alpha_frequency(tag2, tag1, zone)
if freq > 0:
subscore = -math.log(freq / self.median_frequencies[tag2][tag1])
else:
subscore = 5
score += subscore
zone = aa.tolocal(aa2.acarbon).floor()
freq = self.alpha_frequency(tag1, tag2, zone)
if freq > 0:
subscore = -math.log(freq / self.median_frequencies[tag1][tag2])
else:
subscore = 5
score += subscore
return score * self.weight
def load_frequencies(self, path):
files = os.listdir(path)
for indfile in files:
if indfile.find(".txt") == -1: continue
all = ("all" in indfile)
if not all: tag1, tag2 = indfile[0:-4].split('-')
tag1 = int(tag1)
tag2 = int(tag2)
with open(join(path, indfile), 'r') as file:
for line in file:
if ";" not in line and len(line.strip()) > 0:
self.median_frequencies[tag1][tag2] = float(line)
continue
ptcomps, freq = line.strip().split(";")
alpha = Point3D(*ptcomps.split(","))
if all:
self.default_alpha_frequencies[alpha] = float(freq.strip().split(" ")[0])
else:
if alpha not in self.alpha_frequencies:
self.alpha_frequencies[alpha] = [[0 for i in xrange(AMINO_ACID_COUNT)] for k in xrange(AMINO_ACID_COUNT)]
self.alpha_frequencies[alpha][tag1][tag2] = float(freq)
print "Loaded frequencies"
def load_rankings(self, path):
files = os.listdir(path)
for indfile in files:
if indfile.find(".txt") == -1: continue
(tag1, tag2) = indfile[0:-4].split('-')
tag1 = int(tag1)
tag2 = int(tag2)
#alpha = self.alpha_frequencies[tag1][tag2]
axis = self.axis_frequencies[tag1][tag2]
with open(join(path, indfile), 'r') as file:
current_alpha = None
#total_count = 0
for line in file:
if "(" not in line:
'''if len(line) > 1:
self.total_count[tag1][tag2] = int(line)
break
else:'''
continue
comps = line.translate(None, "()").split(";")
if "\t" not in line: # New alpha carbon zone
current_alpha = Point3D.list(comps[0].split(","))
#alpha[current_alpha] = int(comps[1])
axis[current_alpha] = {}
else:
x_axis = Point3D.list(comps[1].split(","))
y_axis = Point3D.list(comps[2].split(","))
z_axis = Point3D.list(comps[3].split(","))
axis[current_alpha][PositionZone(current_alpha, x_axis, y_axis, z_axis)] = int(comps[4])
print "Loaded rankings"
def load_axis_regressions(self, path):
if path is None:
print "No path specified for axis radius regressions."
return
files = os.listdir(path)
for indfile in files:
if indfile == "x.txt": startidx = 0
elif indfile == "y.txt": startidx = 3
else: continue
with open(join(path, indfile), 'r') as file:
for line in file:
alpha, h, k, a = line.strip().split(";")
alpha_zone = Point3D(*alpha.split(","))
if alpha_zone in self.axis_regressions:
self.axis_regressions[alpha_zone][startidx] = float(h)
self.axis_regressions[alpha_zone][startidx + 1] = float(k)
self.axis_regressions[alpha_zone][startidx + 1] = float(a)
else:
item = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
item[startidx] = float(h)
item[startidx + 1] = float(k)
item[startidx + 1] = float(a)
self.axis_regressions[alpha_zone] = item
print "Loaded axis regressions"
#MARK: - Medium
class MediumDistributionManager(DistributionManager):
'''Concrete subclass of DistributionManager that provides file-handling methods that parse medium amino acid density data (number of amino acids within 10 angstroms of each type).'''
def __init__(self, frequencies_path):
"""MediumDistributionManager maintains a dataset sourced from frequencies_path that contains the (scaled) frequency of each amino acid having a particular number of other amino acids around it. The format should be a directory of files, one for each amino acid type, with the following structure in each file (coords need not be consecutive):
1,(frequency of 1 amino acid)
2,(frequency of 2 amino acids)
.
.
.
64,(frequency of 64 amino acids)
67,(frequency of 67 amino acids)
(median frequency)"""
super(MediumDistributionManager,self).__init__()
self.weight = 1.0
self.type = medium_disttype
self.defaultvalue = 0
self.frequencies = [{} for i in xrange(AMINO_ACID_COUNT)]
self.median_frequencies = [0.0 for i in xrange(AMINO_ACID_COUNT)]
self.load_frequencies(frequencies_path)
self.identifier = os.path.basename(frequencies_path)
self.refstate = True
def load_frequencies(self, frequencies_path):
files = os.listdir(frequencies_path)
loading_indicator.add_loading_data(len(files))
for indfile in files:
loading_indicator.update_progress(1)
if indfile.find(".txt") == -1: continue
idx = int(indfile[0:-4])
with open(join(frequencies_path, indfile), 'r') as file:
for line in file:
line = line.strip()
if len(line) == 0: continue
if "," not in line and " " not in line:
if self.median_frequencies[idx] == 0.0:
self.median_frequencies[idx] = float(line)
else:
if "," in line: comps = line.split(",")
else: comps = line.split()
coord = int(comps[0])
self.frequencies[idx][coord] = float(comps[1])
for idx in xrange(AMINO_ACID_COUNT):
self.median_frequencies[idx] = sum(self.frequencies[idx])# / float(len([f for f in self.frequencies[idx] if f != 0]))
def score(self, protein, data, isolate=False, system=None, zero_value=0.01):
"""Pass in an array of amino acids for data."""
score = 0.0
volume = protein.volume()
if volume <= 4000.0 / 3.0 * math.pi:
return score
for aa in data:
if not aa: continue
tag = aacode(aa.type)
if tag >= AMINO_ACID_COUNT: tag = 0
if system:
nearby = system.nearby_aa(aa, protein, 10.0)
else:
nearby = protein.nearby_aa(aa, 10.0)
if isolate:
coord = len([x for x in nearby if next((y for y in data if y and y.tag == x.tag), None)])
else:
coord = len(nearby)
freqs = self.frequencies[tag]
if coord in freqs:
f = freqs[coord]
elif len(freqs) > 0:
try: less = max(i for i in freqs if freqs[i] <= coord)
except ValueError: less = -1
try: more = min(i for i in freqs if freqs[i] >= coord)
except ValueError: more = -1
if less != -1 and more != -1:
f = (freqs[more] - freqs[less]) / (more - less) * (coord - less) + freqs[less]
if less == -1 and more == -1:
f = 0.0
elif less == -1:
f = more
elif more == -1:
f = less
else:
f = 0.0
#f0 = self.median_frequencies[tag]
f = f / self.median_frequencies[tag]
if self.refstate:
N = len(protein.aminoacids)
f0 = ((1 - (4000 * math.pi) / (3 * volume)) ** (N - 1)) / (((3 * volume) / (4000 * math.pi) - 1) ** coord)
f0 *= float(math.factorial(N - 1) / (math.factorial(coord) * math.factorial(N - coord - 1)))
else:
f0 = self.median_frequencies[tag]
if f <= 0: f = zero_value
subscore = -math.log(f / f0)
score += subscore
return score * self.weight
#MARK: - Helper
def place_element_sorted(array, element, sorter=None, min=0, max=0):
"""Performs binary search to place the element in its proper place in the sorted array."""
if sorter is None:
def sort(a, b):
if a > b: return 1
elif a == b: return 0
else: return -1
sorter = sort
if max == 0: max = len(array)
if max - min == 2:
value = sorter(array[min], element)
if value >= 0: array.insert(min, element)
else:
value = sorter(array[min + 1], element)
if value >= 0: array.insert(min + 1, element)
else: array.insert(min + 2, element)
elif max - min == 1:
value = sorter(array[min], element)
if value >= 0: array.insert(min, element)
else: array.insert(min + 1, element)
elif max - min == 0:
array.append(element)
else:
mid = int(floor((max + min) / 2))
value = sorter(array[mid], element)
if value > 0: place_element_sorted(array, element, sorter=sorter, min=min, max=mid)
elif value == 0: array.insert(mid + 1, element)
else: place_element_sorted(array, element, sorter=sorter, min=mid, max=max)
def place_elements_sorted(array, elements, sorter=None):
"""Places elements in array so that the array is sorted according to 'sorter'. Assumes that array is already sorted."""
if sorter is None:
def sort(a, b):
if a > b: return 1
elif a == b: return 0
else: return -1
sorter = sort
for x in elements:
place_element_sorted(array, x, sorter=sorter)