-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfunctions.py
84 lines (52 loc) · 2.79 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# Ami G. Sangster
import numpy as np
import ruptures as rpt
def get_protein_segments(emb_dict, max_bkps_per100aa=3):
    """
    Define the boundaries of protein segments using change point analysis.

    Each per-residue embedding is passed to a ruptures sliding-window
    detector (window width 30, 'rbf' cost model, jump 1), and the returned
    breakpoints are converted into consecutive [start, stop] index pairs.

    Parameters:
        emb_dict: a dictionary of UniProt protein IDs as keys and ProtT5
            per-residue embeddings as values. Only ``emb.shape[0]`` is read
            here (used as the sequence length); the array itself is handed
            to ruptures.
        max_bkps_per100aa: the maximum number of boundaries per 100 amino
            acids in the protein. At least one boundary is always requested.
    Returns:
        protein_segments: a dictionary of UniProt protein IDs as keys. Each
            value is either a list of [start, stop] pairs (the first pair
            starts at 0, each later pair starts where the previous one
            stops), or the string "Failed" if the segmentation raised an
            exception for that protein.
    """
    protein_segments = {}
    # for each protein and embedding
    for protein, emb in emb_dict.items():
        # scale the breakpoint budget with the sequence length, minimum of 1
        n_boundaries = max(int(emb.shape[0] * max_bkps_per100aa / 100), 1)
        # try to get the breakpoints from change point analysis
        try:
            alg = rpt.Window(width=30, model='rbf', jump=1).fit(emb)
            # n_bkps is the number of breakpoints requested for this protein
            boundaries = alg.predict(n_bkps=n_boundaries)
            # convert boundaries to segments; NOTE(review): ruptures is
            # expected to report the end of the signal as the last
            # breakpoint, so the final segment should end at the sequence
            # length -- confirm against the ruptures documentation
            my_segments = [[0, boundaries[0]]]
            my_segments.extend(
                [start, stop] for start, stop in zip(boundaries, boundaries[1:])
            )
        # this algorithm can fail if the maximum number of breakpoints is
        # unreasonable. Catch Exception rather than using a bare except so
        # that KeyboardInterrupt / SystemExit still stop the run instead of
        # being recorded as a failed protein.
        except Exception:
            print("Failed", protein, "number of boundaries", n_boundaries)
            my_segments = "Failed"
        protein_segments[protein] = my_segments
    return protein_segments
def get_protein_segment_embeddings(emb_dict, protein_segment_boundaries):
    """
    Average the per-residue embeddings inside every segment of every protein.

    Parameters:
        emb_dict: a dictionary of UniProt protein IDs as keys and ProtT5
            per-residue embeddings as values (indexed as a 2-D array,
            residues along the first axis)
        protein_segment_boundaries: a dictionary of UniProt protein IDs as
            keys and, as values, either a list of [start, stop] segments or
            the string "Failed". Every key of emb_dict must be present.
    Returns:
        protein_segment_embeddings: a dictionary of "ID start-stop" as keys
            and segment embeddings as values. ID is the UniProt protein ID,
            start and stop are the slice positions of the segment in the
            original protein sequence, and each segment embedding is the
            mean over residues start..stop-1, i.e. one vector with the same
            width as the per-residue embedding (1024 for ProtT5 according
            to the original author's note).
    """
    segment_embeddings = {}
    for protein_id, residue_emb in emb_dict.items():
        segments = protein_segment_boundaries[protein_id]
        # proteins whose segmentation failed contribute no entries
        if segments == "Failed":
            continue
        for segment in segments:
            start, stop = segment[0], segment[1]
            # collapse the residue axis to get one vector per segment
            segment_embeddings[protein_id + f" {start}-{stop}"] = np.mean(
                residue_emb[start:stop, :], axis=0
            )
    return segment_embeddings