"""
Copyright [2009-present] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import asyncio
import json
import os
import random
import string

import sqlalchemy as sa
from aiopg.sa import create_engine
from dotenv import load_dotenv

from database.models import Article, Result

load_dotenv()
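
# The database credentials are read from a .env file. An illustrative .env
# (variable names taken from the os.getenv calls in export_results below):
#   username=<database user>
#   db=<database name>
#   host=<database host>
#   pass=<database password>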


async def create_json_file(annotations):
    """
    Creates the JSON file that will be used by Europe PMC
    :param annotations: list of results
    :return: none
    """
    # random string to use in the filename
    random_char = "".join(random.choices(string.ascii_uppercase + string.digits, k=16))

    # note: assumes the json_files/ directory already exists
    with open("json_files/RNAcentral_annotations_" + random_char, "w", encoding="utf-8") as outfile:
        outfile.write("[" + ",\n".join(json.dumps(row, ensure_ascii=False) for row in annotations) + "]\n")


async def create_annotation(json_obj, exact, section, job_id, urs):
    """
    Appends an annotation to the json object (in place)
    :param json_obj: json object
    :param exact: sentence containing the match
    :param section: section from which the sentence was taken
    :param job_id: identifier (id), gene name or synonym
    :param urs: URS related to the job_id
    :return: none
    """
    json_obj["anns"].append({
        "exact": exact,
        "section": section,
        "tags": [
            {
                "name": job_id,
                "uri": "https://rnacentral.org/rna/" + urs.upper()
            }
        ]
    })
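

# Illustrative shape of one exported object (values are hypothetical; the
# structure mirrors what create_annotation and export_results build below):
# {
#     "src": "PMC",
#     "id": "PMC6157089",
#     "provider": "RNAcentral",
#     "anns": [
#         {
#             "exact": "<sentence mentioning the job_id>",
#             "section": "title",
#             "tags": [{"name": "16s rrna", "uri": "https://rnacentral.org/rna/URS..."}]
#         }
#     ]
# }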


async def export_results():
    """
    Function to export results to Europe PMC.
    Run it with: python3 export_results.py
    """
    # get parameters
    user = os.getenv("username")
    database = os.getenv("db")
    host = os.getenv("host")
    password = os.getenv("pass")

    # list of annotations
    annotations = []

    async with create_engine(user=user, database=database, host=host, password=password) as engine:
        # get list of articles
        query = (sa.select([Article.c.pmcid, Article.c.title]).select_from(Article).where(~Article.c.retracted))  # noqa
        pmcid_list = []

        async with engine.acquire() as connection:
            async for row in connection.execute(query):
                pmcid_list.append({"pmcid": row["pmcid"], "title": row["title"]})

            for item in pmcid_list:
                json_obj = {"src": "PMC", "id": item["pmcid"], "provider": "RNAcentral", "anns": []}

                # get results
                results = []
                results_sql = (sa.select([Result.c.id, Result.c.job_id, Result.c.id_in_title,
                                          Result.c.id_in_abstract, Result.c.id_in_body])
                               .select_from(Result)
                               .where(Result.c.pmcid == item["pmcid"]))  # noqa
                async for row in connection.execute(results_sql):
                    results.append({
                        "id": row["id"],
                        "job_id": row["job_id"],
                        "id_in_title": row["id_in_title"],
                        "id_in_abstract": row["id_in_abstract"],
                        "id_in_body": row["id_in_body"],
                    })

                # remove duplicate sentences for similar job_ids.
                # e.g. article PMC6157089 uses the following job_ids: 16s rrna, rrna, 16s
                # in this case we want to export only 16s rrna
                remove_jobs = []
                if len(results) > 1:
                    job_list = [result["job_id"] for result in results]
                    # check if any job_id contains two or more words
                    check_jobs = any(len(job.split()) > 1 for job in job_list)
                    # collect job_ids that are substrings of a longer job_id
                    if check_jobs:
                        tmp_list = list(job_list)
                        for job in job_list:
                            tmp_list.remove(job)
                            if any(job in jobs for jobs in tmp_list):
                                remove_jobs.append(job)
                            tmp_list = list(job_list)

                # filter results
                results = [result for result in results if result["job_id"] not in remove_jobs]
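                # e.g. with job_list = ["16s rrna", "rrna", "16s"], both "rrna" and
                # "16s" are substrings of "16s rrna", so remove_jobs == ["rrna", "16s"]
                # and only the "16s rrna" result survives the filter above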

                for result in results:
                    # get urs
                    urs_sql = sa.text(
                        '''SELECT primary_id FROM database
                        WHERE job_id=:job_id AND primary_id LIKE 'urs%' LIMIT 1'''
                    )
                    async for row in connection.execute(urs_sql, job_id=result["job_id"]):
                        result["urs"] = row.primary_id if row.primary_id else None

                    # add annotation in case the article title contains the job_id
                    if "urs" in result and result["id_in_title"]:
                        await create_annotation(json_obj, item["title"], "title", result["job_id"], result["urs"])

                    # add annotation if the abstract contains the job_id
                    if "urs" in result and result["id_in_abstract"]:
                        # get the longest sentence in the abstract
                        abstract_sql = sa.text(
                            '''SELECT sentence FROM abstract_sentence
                            WHERE result_id=:result_id ORDER BY length(sentence) DESC LIMIT 1'''
                        )
                        async for row in connection.execute(abstract_sql, result_id=result["id"]):
                            abs_sentence = row.sentence
                            if "found in an image, table or supplementary material" not in abs_sentence:
                                await create_annotation(json_obj, abs_sentence, "abstract",
                                                        result["job_id"], result["urs"])

                    # add annotation if the body of the article contains the job_id
                    if "urs" in result and result["id_in_body"]:
                        # get the longest sentence in the body
                        body_sql = sa.text(
                            '''SELECT sentence FROM body_sentence
                            WHERE result_id=:result_id ORDER BY length(sentence) DESC LIMIT 1'''
                        )
                        async for row in connection.execute(body_sql, result_id=result["id"]):
                            body_sentence = row.sentence
                            if "found in an image, table or supplementary material" not in body_sentence:
                                await create_annotation(json_obj, body_sentence, "body",
                                                        result["job_id"], result["urs"])
if json_obj["anns"]:
# add annotation
annotations.append(json_obj)
# every file must have less than 10000 rows, where each row represents an individual
# article with all associated annotations.
for i in range(0, len(annotations), 9999):
await create_json_file(annotations[i:i + 9999])


if __name__ == "__main__":
    asyncio.run(export_results())
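
# Usage sketch (assumes a reachable database and an existing json_files/ directory):
#   $ python3 export_results.py
# Each run writes one or more files named json_files/RNAcentral_annotations_<random>,
# each containing a JSON array of up to 9999 article objects.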