Skip to content

Commit 251fa7c

Browse files
committed
lots of stuff
1 parent 0c88e53 commit 251fa7c

24 files changed

+1084
-97
lines changed

Diff for: OLD_update_annotations.py

+246
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,246 @@
1+
#FLOW:
2+
# Given PDB ID:
3+
# Open Database and pull structure
4+
# Get Chain IDs
5+
# Get Entity ID
6+
# Get Uniprot ID
7+
#From here call a lot of code!
8+
# Jaspar
9+
# etc.
10+
# Get citation data
11+
# Add date last modified
12+
13+
14+
from pymongo import MongoClient
15+
import sys
16+
import json
17+
import requests
18+
from query_jaspar import getJasparLogo
19+
from getUniprot import getUniprot
20+
from get_citation_data import get_citation_data
21+
import time
22+
23+
connection_string = "mongodb://localhost:27017/"
24+
#NOTE: 1jgg doesn't map correctly
25+
26+
#chid = chain id
27+
def get_entity_id(chid, pdb_id):
    """Return the RCSB polymer entity ID for one chain of a PDB entry.

    Queries the ``polymer_entity_instance`` endpoint of data.rcsb.org for the
    given entry/chain pair and reads ``entity_id`` from the
    ``rcsb_polymer_entity_instance_container_identifiers`` section of the
    JSON response.

    Args:
        chid: Chain ID within the structure.
        pdb_id: PDB ID of the structure.

    Returns:
        The entity ID as a string.

    Raises:
        Exception: With the message "Could not get entity ID" when the
            request fails or the response lacks the expected fields. The
            underlying error is attached as ``__cause__``.
    """
    url = "https://data.rcsb.org/rest/v1/core/polymer_entity_instance/{}/{}".format(pdb_id, chid)
    try:
        # A timeout keeps one stalled request from hanging a whole batch run.
        response = json.loads(requests.get(url, timeout=30).text)
        identifiers = response['rcsb_polymer_entity_instance_container_identifiers']
        return str(identifiers['entity_id'])
    except (requests.RequestException, ValueError, KeyError, TypeError) as err:
        raise Exception("Could not get entity ID") from err
37+
38+
def get_uniprot_id(entity_id, pdb_id):
    """Return the UniProt ID that data.rcsb.org reports for a polymer entity.

    Queries the ``uniprot`` endpoint of data.rcsb.org for the given
    entry/entity pair and returns the ``rcsb_id`` of the first element of the
    JSON response.

    Args:
        entity_id: Entity ID within the structure (see ``get_entity_id``).
        pdb_id: PDB ID of the structure.

    Returns:
        The ``rcsb_id`` value of the first record in the response.

    Raises:
        Exception: With the message "Could not get Uniprot ID" when the
            request fails or the response lacks the expected shape. The
            underlying error is attached as ``__cause__``.
    """
    url = "https://data.rcsb.org/rest/v1/core/uniprot/{}/{}".format(pdb_id, entity_id)
    try:
        # A timeout keeps one stalled request from hanging a whole batch run.
        response = json.loads(requests.get(url, timeout=30).text)
        return response[0]["rcsb_id"]
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as err:
        raise Exception("Could not get Uniprot ID") from err
47+
48+
def get_all_pdb_ids(client):
    """Return the ``structure_id`` of every document in the dna-protein collection.

    Args:
        client: An open MongoDB client; the ``dnaprodb2`` database and its
            ``dna-protein`` collection are read through it. The client is
            NOT closed here; that remains the caller's responsibility.

    Returns:
        A list of ``structure_id`` values, one per document that has the
        field, in the order the query yields them.
    """
    collection = client['dnaprodb2']['dna-protein']
    # Ask only for the structure_id field of each document.
    cursor = collection.find({}, {"structure_id": 1, "_id": 0})
    return [doc['structure_id'] for doc in cursor if 'structure_id' in doc]
64+
65+
# Wrap everything in try to catch errors
66+
def _chain_ids(document, section):
    """Return the chain IDs under ``document[section]['chains']``, reporting missing parts."""
    if section not in document:
        print(f"Error: '{section}' object not found in the document.")
        return []
    if 'chains' not in document[section]:
        print(f"Error: 'chains' list not found in the '{section}' object.")
        return []
    return [chain['id'] for chain in document[section]['chains'] if 'id' in chain]


def _lookup_uniprot_ids(chain_ids, pdb_id):
    """Resolve chains to UniProt IDs, de-duplicated in first-seen order."""
    found = {}
    for chain_id in chain_ids:
        try:
            entity_id = get_entity_id(chain_id, pdb_id)
            found[get_uniprot_id(entity_id, pdb_id)] = True
        except Exception:
            # Best effort: DNA chains are looked up as well, so a chain
            # whose lookup fails is simply skipped.
            continue
    return list(found)


def _fetch_protein_annotation(uniprot_id):
    """Collect JASPAR and UniProt annotations for one UniProt ID.

    Returns a ``(metadata, search_terms, go_ids)`` tuple: the per-protein
    metadata dict, the term list that feeds the search index, and the GO IDs
    as a list. Lookup failures are reported and leave the defaults in place.
    """
    jaspar_path = ""
    try:
        # getJasparLogo may return False when there is no logo; store "".
        jaspar_path = getJasparLogo(uniprot_id) or ""
    except Exception:
        print("Error getting jaspar logo!")

    organism, protein_name = "N/A", "N/A"
    go_terms_c, go_terms_p, go_terms_f, go_ids = [], [], [], []
    try:
        organism, go_terms_c, go_terms_p, go_terms_f, protein_name, go_ids = getUniprot(uniprot_id)
    except Exception:
        print("Error getting uniprot data")

    # Normalise empty/None results so the comprehensions below cannot fail.
    go_terms_c = go_terms_c or []
    go_terms_p = go_terms_p or []
    go_terms_f = go_terms_f or []
    go_ids = list(go_ids or [])

    metadata = {
        'jasparPath': jaspar_path,
        'organism': organism,
        'GO_cellular_component': [term[0] for term in go_terms_c],
        # NOTE(review): the _f list is stored as biological process and the
        # _p list as molecular function. If getUniprot follows the usual GO
        # aspect letters (P = process, F = function) these two are swapped.
        # Mapping kept unchanged pending confirmation against getUniprot.
        'GO_biological_process': [term[0] for term in go_terms_f],
        'GO_molecular_function': [term[0] for term in go_terms_p],
        'protein_name': protein_name,
    }
    return metadata, go_terms_p, go_ids


def update_annotation(pdb_id, client):
    """Refresh the protein, search and citation annotations of one structure.

    Loads the document whose ``structure_id`` equals ``pdb_id`` from the
    ``dna-protein`` collection of ``dnaprodb2``, resolves its protein and DNA
    chain IDs to UniProt IDs, gathers JASPAR/UniProt annotations for each,
    and writes ``protein_metadata``, ``search`` and
    ``meta_data.citation_data`` back with ``replace_one``.

    Args:
        pdb_id: ``structure_id`` of the document to update.
        client: An open MongoDB client; it is not closed here.

    Returns:
        None. Progress and errors are printed. Any exception is caught and
        reported, so a failure on one structure does not stop a batch run.
    """
    try:
        collection = client['dnaprodb2']['dna-protein']
        document = collection.find_one({"structure_id": pdb_id})

        if document is None:
            print(f"No entry found for structure_id: {pdb_id}")
            # Return instead of sys.exit(): SystemExit is not an Exception
            # and would end a whole batch update on one missing entry.
            return

        chain_ids = _chain_ids(document, 'protein') + _chain_ids(document, 'dna')

        protein_metadata = {}
        # Collected for the search sub-document.
        protein_names = []
        organisms = []
        uniprot_ids = []
        go_molecular_function_search = []
        all_go_ids = []

        for uniprot_id in _lookup_uniprot_ids(chain_ids, pdb_id):
            metadata, search_terms, go_ids = _fetch_protein_annotation(uniprot_id)
            protein_metadata[uniprot_id] = metadata
            protein_names.append(metadata['protein_name'])
            organisms.append(metadata['organism'])
            uniprot_ids.append({'uniprot_accession': uniprot_id, 'go_ids': go_ids})
            # The search list keeps whole term entries, whereas
            # protein_metadata keeps only element 0 of each.
            for term in search_terms:
                if term not in go_molecular_function_search:
                    go_molecular_function_search.append(term)
            # Gather GO IDs over every protein, not just the last one.
            for go_id in go_ids:
                if go_id not in all_go_ids:
                    all_go_ids.append(go_id)

        if not protein_metadata:
            # No UniProt mapping at all: store placeholder values.
            search_dict = {
                'organism': '?',
                'uniprot_names': ['?'],
                'GO_molecular_function': ['?'],
                'organisms': '?',
                'go_ids': ['?'],
            }
        else:
            search_dict = {
                # The single 'organism' value is that of the first protein.
                'organism': organisms[0],
                'GO_molecular_function': go_molecular_function_search,
                'uniprot_names': protein_names,
                'organisms': organisms,
                'go_ids': all_go_ids or ['?'],
                'uniprot_ids': uniprot_ids,
            }

        document['protein_metadata'] = protein_metadata
        document['search'] = search_dict

        (citation_title, year, authors, doi, pubmed, method, keywords,
         release_date, title) = get_citation_data(pdb_id)
        citation_data = {
            'doi': doi,
            'structure_title': title,
            # NOTE(review): the key is spelled 'release_data' although it
            # holds release_date. Kept as-is because stored documents and
            # readers may rely on it; confirm before renaming.
            'release_data': release_date,
            'year': year,
            'exp_method': method,
            'citation_title': citation_title,
            'pubmed_id': pubmed,
            'authors': authors,
            'keywords': keywords,
        }

        # Create meta_data when the stored document does not have it yet.
        document.setdefault('meta_data', {})['citation_data'] = citation_data
        collection.replace_one({"structure_id": pdb_id}, document)
        print(f"Document with structure_id {pdb_id} has been updated.")
    except Exception as e:
        print(f"An error occurred: {e}")
220+
221+
def update_all_annotations():
    """Run ``update_annotation`` for every structure in the database.

    Opens one MongoDB client from ``connection_string``, fetches all
    structure IDs with ``get_all_pdb_ids`` and updates them one by one.
    Errors are printed rather than raised, and the client is always closed.
    """
    try:
        client = MongoClient(connection_string)
    except Exception as e:
        # Creating the client failed, so there is nothing to close.
        print(f"An error occurred during batch update: {e}")
        return

    try:
        for pdb_id in get_all_pdb_ids(client):
            print("Updating", pdb_id)
            update_annotation(str(pdb_id), client)
    except Exception as e:
        print(f"An error occurred during batch update: {e}")
    finally:
        client.close()
232+
233+
def update_one_annotation(pdb_id):
    """Run ``update_annotation`` for a single structure.

    Args:
        pdb_id: ``structure_id`` of the document to update; it is converted
            to ``str`` before use.

    Opens a MongoDB client from ``connection_string`` for this one update.
    Errors are printed rather than raised, and the client is always closed.
    """
    try:
        client = MongoClient(connection_string)
    except Exception as e:
        # Creating the client failed, so there is nothing to close.
        print(f"An error occurred during update: {e}")
        return

    try:
        print("Updating", pdb_id)
        update_annotation(str(pdb_id), client)
    except Exception as e:
        print(f"An error occurred during update: {e}")
    finally:
        client.close()
242+
243+
if __name__ == "__main__":
    # With a PDB ID on the command line update only that entry;
    # with no argument update every entry in the database.
    if len(sys.argv) > 1:
        update_one_annotation(sys.argv[1])
    else:
        update_all_annotations()

Diff for: add_structure_db.py

100644100755
+64-35
Original file line numberDiff line numberDiff line change
@@ -1,48 +1,77 @@
11
from pymongo import MongoClient
22
import json
3+
import os
34
import sys
45

5-
# MongoDB connection string
6-
# Adjust the connection string if you have authentication enabled or if you're not running on the default port
7-
connection_string = "mongodb://localhost:27017/"
6+
def add_structure_db(filepath):
    """Upsert the structure document(s) stored in a JSON file into MongoDB.

    The file may hold a single document or a list of documents. Each one is
    upserted into the ``dna-protein`` collection of ``dnaprodb2``, keyed on
    ``structure_id``. A ``structure_id`` containing
    ``external/PDB/pdb_entries/`` is reduced to the part after that prefix.

    Args:
        filepath: Path to the JSON file.

    Returns:
        ``"0"`` on success, or ``filepath`` when a document has no
        ``structure_id`` (nothing is written in that case).
    """
    # Adjust the connection string if you have authentication enabled or
    # if you're not running on the default port.
    connection_string = "mongodb://localhost:27017/"
    path_prefix = 'external/PDB/pdb_entries/'

    with open(filepath, 'r') as file:
        data = json.load(file)

    documents = data if isinstance(data, list) else [data]

    # Validate everything before connecting, so a bad file neither writes
    # partial data nor leaves a client open.
    for document in documents:
        if not isinstance(document, dict) or 'structure_id' not in document:
            print("ERROR. Structure ID not in document for path:", filepath)
            return filepath
        structure_id = document['structure_id']
        if path_prefix in structure_id:
            document['structure_id'] = structure_id.split(path_prefix)[1]

    client = MongoClient(connection_string)
    try:
        collection = client['dnaprodb2']['dna-protein']
        for document in documents:
            # structure_id is the unique key for the upsert.
            query = {"structure_id": document["structure_id"]}
            result = collection.update_one(query, {"$set": document}, upsert=True)
            if result.matched_count > 0:
                print(f"Updated document with structure_id {document['structure_id']}.")
            elif result.upserted_id is not None:
                print(f"Inserted new document with structure_id {document['structure_id']}.")
    finally:
        # Close the connection even when an upsert raises.
        client.close()
    return "0"
53+
54+
def get_absolute_filepaths(directory):
    """Return the absolute path of every file under ``directory``, recursively.

    Args:
        directory: Root directory to walk.

    Returns:
        A list of absolute file paths in ``os.walk`` order. The list is
        empty when the directory holds no files.
    """
    return [
        os.path.abspath(os.path.join(dirpath, filename))
        for dirpath, _dirnames, filenames in os.walk(directory)
        for filename in filenames
    ]
66+
67+
# if __name__ == '__main__':
68+
# with open('errors.txt', 'w') as my_file:
69+
# file_paths = get_absolute_filepaths('/home/aricohen/Desktop/dnaprodb/json/')
70+
# for path in file_paths:
71+
# output = add_structure_db(path)
72+
# if not (output == "0"):
73+
# my_file.write(output)
74+
75+
76+
if __name__ == '__main__':
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} <path-to-structure.json>")
    # add_structure_db returns "0" on success and the offending path on
    # failure; turn that into the process exit status.
    output = add_structure_db(sys.argv[1])
    sys.exit(0 if output == "0" else 1)

0 commit comments

Comments
 (0)