# FLOW:
# Given a PDB ID:
#   Open the database and pull the structure
#   Get chain IDs
#   Get entity IDs
#   Get UniProt IDs
#   From here, call the annotation helpers:
#     JASPAR
#     etc.
#   Get citation data
#   Add date last modified
12
+
13
+
14
+ from pymongo import MongoClient
15
+ import sys
16
+ import json
17
+ import requests
18
+ from query_jaspar import getJasparLogo
19
+ from getUniprot import getUniprot
20
+ from get_citation_data import get_citation_data
21
+ import time
22
+
23
+ connection_string = "mongodb://localhost:27017/"
24
+ #NOTE: 1jgg doesn't map correctly
25
+
26
+ #chid = chain id
27
def get_entity_id(chid, pdb_id, timeout=30):
    """Look up the RCSB polymer entity ID for one chain of a PDB entry.

    Requests the RCSB Data API ``polymer_entity_instance`` resource for
    ``pdb_id``/``chid`` and reads ``entity_id`` from the
    ``rcsb_polymer_entity_instance_container_identifiers`` section of the
    JSON reply.

    Args:
        chid: Chain ID of the instance to look up.
        pdb_id: PDB entry ID the chain belongs to.
        timeout: Seconds to wait for the HTTP response before giving up, so
            one stalled request cannot hang the caller indefinitely.

    Returns:
        The entity ID as a plain string, without surrounding quotes.

    Raises:
        Exception: With the message "Could not get entity ID" when the
            request fails or the reply lacks the expected fields. The
            underlying error is chained as ``__cause__``.
    """
    url = "https://data.rcsb.org/rest/v1/core/polymer_entity_instance/{}/{}".format(
        pdb_id, chid
    )
    try:
        response = requests.get(url, timeout=timeout).json()
        identifiers = response['rcsb_polymer_entity_instance_container_identifiers']
        # Take the value directly instead of serialising it to JSON and then
        # trying to strip the quotes back out of the serialised text.
        return str(identifiers['entity_id'])
    except Exception as err:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being rewrapped and swallowed.
        raise Exception("Could not get entity ID") from err
37
+
38
def get_uniprot_id(entity_id, pdb_id, timeout=30):
    """Look up the UniProt ID annotated for one polymer entity of a PDB entry.

    Requests the RCSB Data API ``uniprot`` resource for
    ``pdb_id``/``entity_id`` and returns the ``rcsb_id`` field of the first
    element of the JSON reply.

    Args:
        entity_id: Polymer entity ID within the PDB entry.
        pdb_id: PDB entry ID.
        timeout: Seconds to wait for the HTTP response before giving up, so
            one stalled request cannot hang the caller indefinitely.

    Returns:
        The ``rcsb_id`` value of the first record in the reply.

    Raises:
        Exception: With the message "Could not get Uniprot ID" when the
            request fails or the reply does not have the expected shape.
            The underlying error is chained as ``__cause__``.
    """
    url = "https://data.rcsb.org/rest/v1/core/uniprot/{}/{}".format(pdb_id, entity_id)
    try:
        response = requests.get(url, timeout=timeout).json()
        # Only the first record of the reply is used.
        return response[0]["rcsb_id"]
    except Exception as err:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being rewrapped and swallowed.
        raise Exception("Could not get Uniprot ID") from err
47
+
48
def get_all_pdb_ids(client):
    """Return the ``structure_id`` of every document in the structure collection.

    Reads the ``dna-protein`` collection of the ``dnaprodb2`` database through
    ``client``. The client is only read from; it is not closed here.

    Args:
        client: Mapping-style database client (e.g. a ``MongoClient``) where
            ``client['dnaprodb2']['dna-protein']`` exposes ``find``.

    Returns:
        A list of ``structure_id`` values in the order ``find`` yields them.
        Documents without a ``structure_id`` field are skipped.
    """
    collection = client['dnaprodb2']['dna-protein']
    # Project only structure_id (and drop _id) so whole documents are not fetched.
    cursor = collection.find({}, {"structure_id": 1, "_id": 0})
    return [doc['structure_id'] for doc in cursor if 'structure_id' in doc]
64
+
65
+ # Wrap everything in try to catch errors
66
def update_annotation(pdb_id, client):
    """Refresh the protein, search and citation annotations of one entry.

    Loads the document whose ``structure_id`` equals ``pdb_id`` from
    ``client['dnaprodb2']['dna-protein']``, then:

    1. collects the protein and DNA chain IDs stored in the document,
    2. maps each chain to a UniProt ID via ``get_entity_id`` and
       ``get_uniprot_id`` (chains for which either lookup fails are skipped),
    3. fetches the JASPAR logo path and UniProt data for each distinct
       UniProt ID and stores them under ``protein_metadata``,
    4. builds the aggregated ``search`` sub-document,
    5. stores the citation data under ``meta_data.citation_data``,
    6. writes the document back with ``replace_one``.

    Any exception is caught and printed, so this function never raises an
    ``Exception`` subclass to its caller.

    Args:
        pdb_id: ``structure_id`` of the document to update.
        client: Mapping-style database client (e.g. a ``MongoClient``).

    Returns:
        None.
    """
    try:
        collection = client['dnaprodb2']['dna-protein']

        document = collection.find_one({"structure_id": pdb_id})
        if document is None:
            print(f"No entry found for structure_id: { pdb_id } ")
            # Return instead of sys.exit(0): SystemExit is not caught by the
            # `except Exception` below, so exiting here would abort an entire
            # batch run because of one missing entry.
            return

        # A dict keeps the distinct UniProt IDs in first-seen order.
        uniprot_dict = {}
        for chain_id in _collect_chain_ids(document):
            try:
                entity_id = get_entity_id(chain_id, pdb_id)
                uniprot_id = get_uniprot_id(entity_id, pdb_id)
            except Exception:
                # Best effort: a chain without a resolvable UniProt ID is
                # skipped rather than failing the whole update.
                continue
            uniprot_dict[uniprot_id] = True

        # Values aggregated over all proteins for the search sub-document.
        protein_names = []
        go_molecular_function_search = []
        organisms = []
        uniprot_ids = []
        all_go_ids = []

        for uniprot_id in uniprot_dict:
            # Defaults that are kept when a lookup below fails.
            jaspar_path = ""
            organism = "N/A"
            protein_name = "N/A"
            go_terms_c, go_terms_p, go_terms_f, go_ids = [], [], [], []

            try:
                jaspar_path = getJasparLogo(uniprot_id)
                if jaspar_path is False:
                    jaspar_path = ""
            except Exception:
                print("Error getting jaspar logo!")

            try:
                (organism, go_terms_c, go_terms_p, go_terms_f,
                 protein_name, go_ids) = getUniprot(uniprot_id)
                # Normalise empty / None results to lists so the
                # comprehensions and list() calls below cannot fail.
                go_terms_c = go_terms_c or []
                go_terms_f = go_terms_f or []
                go_terms_p = go_terms_p or []
                go_ids = go_ids or []
            except Exception:
                print("Error getting uniprot data")

            uniprot_ids.append({'uniprot_accession': uniprot_id, 'go_ids': list(go_ids)})

            # NOTE(review): 'GO_biological_process' is filled from go_terms_f
            # and 'GO_molecular_function' from go_terms_p. Going by the GO
            # aspect letters (P = process, F = function) this looks swapped,
            # but the mapping is left unchanged -- confirm against the return
            # order of getUniprot before touching it.
            uniprot_dict[uniprot_id] = {
                'jasparPath': jaspar_path,
                'organism': organism,
                'GO_cellular_component': [x[0] for x in go_terms_c],
                'GO_biological_process': [x[0] for x in go_terms_f],
                'GO_molecular_function': [x[0] for x in go_terms_p],
                'protein_name': protein_name,
            }

            for mol_fctn in go_terms_p:
                if mol_fctn not in go_molecular_function_search:
                    go_molecular_function_search.append(mol_fctn)

            # Collect GO IDs across every protein; previously only the IDs of
            # the last protein processed reached the search sub-document.
            for go_id in go_ids:
                if go_id not in all_go_ids:
                    all_go_ids.append(go_id)

            protein_names.append(protein_name)
            organisms.append(organism)

        if not uniprot_dict:
            # No UniProt mapping at all: store placeholder values.
            search_dict = {
                'organism': '?',
                'uniprot_names': ['?'],
                'GO_molecular_function': ['?'],
                'organisms': '?',
                'go_ids': ['?'],
            }
        else:
            # 'organism' (singular) is the organism of the first protein.
            first_entry = next(iter(uniprot_dict.values()))
            search_dict = {
                'organism': first_entry['organism'],
                'GO_molecular_function': go_molecular_function_search,
                'uniprot_names': protein_names,
                'organisms': organisms or ['?'],
                # The '?' placeholder used to be overwritten unconditionally.
                'go_ids': all_go_ids or ['?'],
            }
        search_dict['uniprot_ids'] = uniprot_ids

        document['protein_metadata'] = uniprot_dict
        document['search'] = search_dict

        (citation_title, year, authors, doi, pubmed, method,
         keywords, release_date, title) = get_citation_data(pdb_id)
        citation_data = {
            'doi': doi,
            'structure_title': title,
            # NOTE(review): the key is spelled 'release_data' although it
            # holds the release date; kept as is because readers of the
            # stored documents may rely on this spelling.
            'release_data': release_date,
            'year': year,
            'exp_method': method,
            'citation_title': citation_title,
            'pubmed_id': pubmed,
            'authors': authors,
            'keywords': keywords,
        }

        # setdefault: a document without 'meta_data' used to raise KeyError
        # here and lose the whole update.
        document.setdefault('meta_data', {})['citation_data'] = citation_data
        collection.replace_one({"structure_id": pdb_id}, document)
        print(f"Document with structure_id { pdb_id } has been updated.")
    except Exception as e:
        print(f"An error occurred: { e } ")


def _collect_chain_ids(document):
    """Return the IDs of the protein chains, then the DNA chains, of ``document``.

    Prints an error line (and contributes no IDs) for a molecule whose object
    or ``chains`` list is missing. Chains without an ``id`` field are skipped.
    """
    sections = (
        ('protein',
         "Error: 'protein' object not found in the document.",
         "Error: 'chains' list not found in the 'protein' object."),
        ('dna',
         "Error: 'dna' object not found in the document.",
         "Error: 'chains' list not found in the 'dna' object."),
    )
    chain_ids = []
    for key, missing_object_msg, missing_chains_msg in sections:
        if key not in document:
            print(missing_object_msg)
            continue
        molecule = document[key]
        if 'chains' not in molecule:
            print(missing_chains_msg)
            continue
        chain_ids.extend(chain['id'] for chain in molecule['chains'] if 'id' in chain)
    return chain_ids
220
+
221
def update_all_annotations():
    """Refresh the annotations of every structure in the database.

    Opens one ``MongoClient`` on ``connection_string``, reads all structure
    IDs with ``get_all_pdb_ids`` and calls ``update_annotation`` for each,
    sharing the same client. Errors are printed rather than raised, and the
    client is closed on the way out.

    Returns:
        None.
    """
    # Bound before the try so the finally clause never sees an undefined
    # name if constructing the client itself fails.
    client = None
    try:
        client = MongoClient(connection_string)
        for pdb_id in get_all_pdb_ids(client):
            print("Updating", pdb_id)
            update_annotation(str(pdb_id), client)
    except Exception as e:
        print(f"An error occurred during batch update: { e } ")
    finally:
        if client is not None:
            client.close()
232
+
233
def update_one_annotation(pdb_id):
    """Refresh the annotations of a single structure.

    Opens a ``MongoClient`` on ``connection_string``, calls
    ``update_annotation`` for ``pdb_id`` and closes the client afterwards.
    Errors are printed rather than raised.

    Args:
        pdb_id: Structure ID to update; converted with ``str`` before use.

    Returns:
        None.
    """
    # Bound before the try so the finally clause never sees an undefined
    # name if constructing the client itself fails.
    client = None
    try:
        client = MongoClient(connection_string)
        print("Updating", pdb_id)
        update_annotation(str(pdb_id), client)
    except Exception as e:
        print(f"An error occurred during update: { e } ")
    finally:
        if client is not None:
            client.close()
242
+
243
if __name__ == "__main__":
    # With PDB IDs on the command line, refresh only those entries;
    # with no arguments, refresh every entry in the collection.
    requested_ids = sys.argv[1:]
    if requested_ids:
        for requested_id in requested_ids:
            update_one_annotation(requested_id)
    else:
        update_all_annotations()
0 commit comments