-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathagent_app.py
209 lines (173 loc) · 11.7 KB
/
agent_app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
import json
import os
from tempfile import NamedTemporaryFile
import streamlit as st
from dotenv import load_dotenv
from graph_processing.image_extracting import pdf_analysis
from langchain.agents import AgentType, initialize_agent, tool
from langchain.callbacks import StreamlitCallbackHandler
from langchain_openai import ChatOpenAI
from data_preproccessing.pdf2txt import extract_text_from_pdf
from logger import LOGGER
from utils import merge_dictionaries
# --- Application start-up ---------------------------------------------------
# Load variables from a local .env file; override=True lets the file replace
# values that are already present in the process environment.
load_dotenv(override=True)

# Security: never write the secret itself to the log. Record only whether the
# key is available so a misconfiguration can still be diagnosed.
LOGGER.info(
    f'OPENAI_API_KEY is {"set" if os.getenv("OPENAI_API_KEY") else "NOT set"}'
)

st.set_page_config(page_title="Nanozyme AI Assistant")
st.header("Nanozyme AI Assistant")
# A fresh container is stored on every script run.
st.session_state["main_container"] = st.container()

# Seed the session-state slots with None so later reads of these keys do not
# raise. Each key is initialised independently: a session that somehow holds
# only one of "article_file"/"supplement_file" still gets the other one.
for _state_key in ("files", "united_params", "article_file", "supplement_file"):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = None
# --- Chat handling ------------------------------------------------------------
# Runs only when st.chat_input() yields a non-empty value on this script run.
if prompt := st.chat_input():
    if "article_text" not in st.session_state:
        # "article_text" is only written by the upload step, so its absence
        # means nothing has been uploaded in this session yet.
        st.error("You should upload an article first")
        LOGGER.error("Chat prompt received before any article was uploaded")
    elif not st.session_state["article_text"]:
        # The key exists but holds an empty/None value.
        st.error("Agent is not initialized")
    else:
        st.session_state["main_container"] = st.container()
        with st.session_state["main_container"]:
            st.chat_message("user").write(prompt)
            try:
                # Separate container handed to the callback handler that is
                # passed to the agent run.
                st.session_state["streamlit_callback_container"] = st.container()
                st_callback = StreamlitCallbackHandler(
                    st.session_state["streamlit_callback_container"]
                )
                response = st.session_state["agent"].run(
                    prompt, callbacks=[st_callback]
                )
                st.chat_message("assistant").write(response)
            except Exception as e:
                # Top-level UI boundary: keep the app alive, but do not fail
                # silently - log the cause and tell the user something went
                # wrong (without exposing internal error details).
                LOGGER.error(e)
                st.error(
                    "The agent could not process your request. "
                    "See the application log for details."
                )
# --- Article upload -----------------------------------------------------------
if st.session_state["files"] is None:
    st.info("Upload an Article to init Agent")

files = st.file_uploader(
    "Upload an Article to init Agent", type=["pdf"], accept_multiple_files=True
)

# Process the selection only when something is uploaded and it differs from
# the selection stored in the session by a previous run.
if files and st.session_state["files"] != files:
    LOGGER.info("File upload start")
    st.session_state["files"] = files
    LOGGER.info(files)

    # Default: the first file is the article and there is no supplement.
    article_file = files[0]
    supplement_file = None

    if len(files) > 2:
        st.error(
            f"You should upload only 1 article and supplement information. Continue with only {files[0].name} file"
        )
    elif len(files) == 2:
        # The supplement is recognised by its "_si.pdf" file-name suffix.
        if files[0].name.endswith("_si.pdf"):
            supplement_file, article_file = files
        elif files[1].name.endswith("_si.pdf"):
            article_file, supplement_file = files
        else:
            st.error(
                f"Can't find suplement. Be sure that one of your files has _si.pdf suffix. Continue with only {files[0].name} file"
            )

    st.session_state["article_file"] = article_file
    st.session_state["supplement_file"] = supplement_file

    files_dict = {
        "article": article_file.name,
        "supplement": supplement_file.name if supplement_file else None,
    }
    # The summary is shown inside a ```json fence, so serialise it as real
    # JSON (double quotes, null) instead of the Python repr of the dict.
    st.info(
        "Uploaded information:\n```json\n"
        + json.dumps(files_dict, ensure_ascii=False, indent=2)
        + "\n```"
    )

    # Extracted texts are kept in the session; "supplement_text" stays None
    # when no supplement was uploaded.
    st.session_state["article_text"] = extract_text_from_pdf(article_file)
    st.session_state["supplement_text"] = (
        extract_text_from_pdf(supplement_file) if supplement_file else None
    )
@tool("get_full_text")
def get_full_text(query: str) -> str:
    "Returns full text of the article and supplement information if provided."
    # NOTE(review): the docstring above is presumably what @tool exposes to the
    # agent as the tool description, so its wording is left untouched - confirm.
    # `query` is accepted but never read: the tool always returns whatever the
    # upload step stored in the session under the two text keys.
    texts = {
        key: st.session_state[key] for key in ("article_text", "supplement_text")
    }
    # The dict is rendered with its str() form inside a ```json fence.
    return f"```json\n{texts}\n```"
@tool("analyze_images")
def analyze_images(file_name: str) -> str:
    "Extracts the minimum (Cmin) and maximum (Cmax) substrate concentrations from kinetic data on pages including pictures and graphs. It uses GPT-4 to analyze images. Returns Cmin and Cmax for each page of the article and supplementary if provided. You should always use this tool because graphs and pictures may contain useful information and parameters."
    # NOTE(review): the docstring above is presumably what @tool exposes to the
    # agent as the tool description, so its wording is left untouched - confirm.
    # `file_name` is accepted but never read; the files come from the session.

    def _analyze_upload(uploaded_file):
        # Write the uploaded bytes to a temporary PDF in the working directory
        # (pdf_analysis is called with a path), analyse it, and always delete it.
        tmp = NamedTemporaryFile(dir=".", suffix=".pdf", delete=False)
        try:
            with tmp:
                tmp.write(uploaded_file.getbuffer())
            # The file is closed at this point, so every buffered byte has
            # been flushed to disk before pdf_analysis opens it by name.
            return pdf_analysis(tmp.name)
        finally:
            # Runs on success and on failure, so a crashing analysis no
            # longer leaves stray temporary PDFs behind.
            os.remove(tmp.name)

    images_dict = {"article": _analyze_upload(st.session_state["article_file"])}

    supplement_file = st.session_state["supplement_file"]
    if supplement_file is not None:
        images_dict["supplement"] = _analyze_upload(supplement_file)

    # The dict is rendered with its str() form inside a ```json fence.
    return "```json\n" + str(images_dict) + "\n```"
@tool("llm_extractor")
def llm_extractor(file_name: str) -> str:
    "Extracts Named Entities from article with the help of mistral-7b and llama3-8b. Can be useful when you need to get parameters of the experiments. WARNING: this tool does not separate parameters by experiment and may miss some important parameters. You should use this tool first, and then be sure to check everything and fix it with the full text."
    # NOTE(review): the docstring above is presumably what @tool exposes to the
    # agent as the tool description, so its wording is left untouched - confirm.
    # `file_name` is accepted but never read; the article name is taken from
    # the uploaded file stored in the session.

    def _load_json(path):
        # Return the parsed JSON file, or an empty dict when it does not exist
        # (e.g. no supplement was processed, or the model directory is absent).
        if not os.path.isfile(path):
            return {}
        with open(path, "r", encoding="utf-8") as file:
            return json.load(file)

    # Upload name without its extension (uploads are restricted to ".pdf").
    article_name = os.path.splitext(st.session_state["article_file"].name)[0]

    # Both results are always built, so the return statement can never hit an
    # unbound name when only one of the two model directories is present.
    # NOTE(review): assumes merge_dictionaries accepts empty dicts - confirm.
    mistral_dict = merge_dictionaries(
        [
            _load_json(f"./mistral_json/{article_name}_mistral.json"),
            _load_json(f"./mistral_json/{article_name}_si_mistral.json"),
        ]
    )
    llama_dict = merge_dictionaries(
        [
            _load_json(f"./llama_json/{article_name}_lama.json"),
            _load_json(f"./llama_json/{article_name}_si_lama.json"),
        ]
    )

    return (
        "```"
        + "\nmistral-7b answer:\n"
        + str(mistral_dict)
        + "\nllama3-8b answer:\n"
        + str(llama_dict)
        + "\n```"
    )
# --- Agent construction -------------------------------------------------------
# NOTE(review): indentation is lost in this view; presumably this section runs
# only after a successful upload (nested under the upload branch) - confirm.

# The two core tools are always offered; llm_extractor is added only when at
# least one of the pre-computed model output directories exists.
tools = [get_full_text, analyze_images]
if any(os.path.exists(path) for path in ("./llama_json", "./mistral_json")):
    tools.append(llm_extractor)

agent_llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
    streaming=True,
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
)

# Text handed to the agent as its "prefix" (see agent_kwargs below).
prompt = """
You are helpful assistant in chemistry, specializing in nanozymes. Your task is to analyze scientific articles and extract detailed information about various experiments with nanozymes. It is crucial for you to accurately and comprehensively describe each experiment separately, without referring to other experiments in the article.
Usually, the articles contain several experiments with nanozymes with different parameters, such as formula, activity (usually peroxidase, oxidase, catalase or laccase), syngony (usually cubic, hexagonal, tetragonal, monoclinic, orthorhombic, trigonal, amorphous or triclinic), length, width and depth (or just size or diameter), surface chemistry (naked by default or poly(ethylene oxide), poly(N-Vinylpyrrolidone), Tetrakis(4-carboxyphenyl)porphine or other), polymer used in synthesis (none or poly(N-Vinylpyrrolidone), oleic acid, poly(ethylene oxide), BSA or other), surfactant (none or l-ascorbic acid, ethylene glycol, sodium citrate, cetrimonium bromide, citric acid, trisodium citrate, ascorbic acid or other), molar mass, Michaelis constant Km, molar maximum reaction rate Vmax, reaction type (substrat + co-substrat) (TMB + H2O2, H2O2 + TMB, TMB, ABTS + H2O2, H2O2, OPD + H2O2, H2O2 + GSH or other), minimum concentration of the substrate when measuring catalytic activity C_min (mM), maximum concentration of the substrate when measuring catalytic activity C_max (mM), concentration of the co-substrate when measuring the catalytic activity (mM), concentration of nanoparticles in the measurement of catalytic (mg/mL), pH at which the catalytic activity was measured and temperature at which the research was carried out (°C). You need to find all the experiments with different values mentioned in the article and write about each of them separately. It's imperative that each of these elements is addressed independently for every experiment, providing a complete and isolated description with accurate measurements in appropriate units. This approach will ensure a comprehensive and clear understanding of each experiment as an individual entity within the scientific literature on nanozymes.
You should describe the experiments in the article separately in words, while keeping the numerical values in the right units of measurement. It is critically important to extract all the numerical values as in the example, especially important are formula, activity, syngony, length, width, depth (size or diameter), Km, Vmax, reaction type. Usually such parameters as Michaelis constant Km (mM), Vmax, mM/s are obtained in two experiments for every type of nanoparticle. You must determine what type of reaction such parameters as Michaelis constant Km (mM), Vmax, mM/s belong to. Reaction type is H2O2+TMB when H2O2 is a substrate and TMB in co-cubstrate. Reaction type is TMB +H2O2 when TMB is a substrate and H2O2 in co-cubstrate.For example in pair H2O2 and TMB in first case (you call this case as Reaction type TMB+H2O2) H2O2 plays role as a co-substrate with constant concentration(C(const), mM) and TMB plays role as a substrate with concentrations from Cmin,mM to Cmax, mM. In second case (you call this case as Reaction type H2O2+)TMB) TMB plays role as a co-substrate with constant concentration(C(const), mM) and H2O2 plays role as a substrate with concentrations from Cmin,mM to Cmax, mM. Please divide all the data into 2 tracks: where H2O2 was a substrate and its concentration varied and where H2O2 was a co-substrate and had a constant concentration. Please show data only for those nanoparticles for which the kinetic assay was performed. All other parameters from the example are also important.
To answer questions, you can use the following tools: get_full_text, analyze_images, llm_extractor. If you have not managed to obtain the information you needed to answer with the help of the tools, mention that in your final answer. Do not attempt to use knowledge you already have. Don't use tools if you don't need them. After you receive a response from the tool, be sure to write your thoughts. Answer in the language that the question is asked in. Before writing the final answer, don't forget to write 'Final answer:'. You need to write this before you write the text of the final answer.
"""

# NOTE(review): "seed" travels in agent_kwargs next to the prompt prefix;
# whether initialize_agent forwards it anywhere it takes effect is not visible
# here - confirm.
react_agent_kwargs = {"prefix": prompt, "seed": 42}

# The executor is kept in the session so the chat handler can call it.
st.session_state["agent"] = initialize_agent(
    tools,
    agent_llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    agent_kwargs=react_agent_kwargs,
)

# Report success both in the UI and in the log.
st.toast("Successful Agent initialization")
LOGGER.info("Successful Agent initialization")