# example.py
import akasha
import akasha.eval as eval
import akasha.summary as summary
import akasha.prompts as prompts
query1 = "五軸是甚麼?"  # "What is five-axis (machining)?"
# Chinese queries about Siemens' own factories and the "Industry 4.0 maturity index",
# used by chain_of_thought() below.
query2 = [
    "西門子自有工廠如何朝工業4.0 發展",
    "詳細解釋「工業4.0 成熟度指數」發展路徑的六個成熟度",
    "根據西門子自有工廠朝工業4.0發展,探討其各項工業4.0的成熟度指標",
]
install_requires = [
"pypdf",
"langchain>=0.1.0",
"chromadb==0.4.14",
"openai==0.27",
"tiktoken",
"lark==1.1.7",
"scikit-learn<1.3.0",
"jieba==0.42.1",
"sentence-transformers==2.2.2",
"torch==2.0.1",
"transformers>=4.33.4",
"llama-cpp-python==0.2.6",
"auto-gptq==0.3.1",
"tqdm==4.65.0",
"docx2txt==0.8",
"rouge==1.0.1",
"rouge-chinese==1.0.3",
"bert-score==0.3.13",
"click",
"tokenizers>=0.13.3",
"streamlit==1.28.2",
"streamlit_option_menu==0.3.6",
]
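# Note: install_requires is plain data here, not an installation step; it is passed
# below as the `info` argument to ask_self, which answers questions from the given text.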
## Custom search function (for search_type): uses query_embeds and docs_embeds to
## determine the relevant documents and returns their indices.
# You can record your own variables in the log dict, e.g. log["dd"] = "miao".
def test_search(query_embeds, docs_embeds, k: int, relevancy_threshold: float,
log: dict):
from scipy.spatial.distance import euclidean
import numpy as np
distance = [[euclidean(query_embeds, docs_embeds[idx]), idx]
for idx in range(len(docs_embeds))]
distance = sorted(distance, key=lambda x: x[0])
    ## rescale the threshold if embedding distances do not fall within 0~1
max_dist = 1
while max_dist < distance[-1][0]:
max_dist *= 10
relevancy_threshold *= 10
    ## record a custom entry in the log
log["dd"] = "miao"
return [
idx for dist, idx in distance[:k]
if (max_dist - dist) >= relevancy_threshold
]
## Custom model function: takes a prompt string and returns the model's reply string.
def test_model(prompt: str):
import openai
from langchain.chat_models import ChatOpenAI
openai.api_type = "open_ai"
model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
ret = model.predict(prompt)
return ret
## Custom embedding function: takes a list of texts and returns their embeddings.
def test_embed(texts: list) -> list:
from sentence_transformers import SentenceTransformer
mdl = SentenceTransformer("BAAI/bge-large-zh-v1.5")
embeds = mdl.encode(texts, normalize_embeddings=True)
return embeds
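## A minimal sketch (not part of the original example) of plugging the custom callables
## above into Doc_QA. search_type=test_search is demonstrated in QA() below; passing
## model=test_model and embeddings=test_embed assumes Doc_QA also accepts callables for
## those two parameters, as akasha's custom-model documentation describes.
def QA_custom_callables(doc_path="./docs/mic/"):
    qa = akasha.Doc_QA(model=test_model,
                       embeddings=test_embed,
                       search_type=test_search)
    qa.get_response(doc_path=doc_path, prompt=query1, chunk_size=500)
    print(qa.response)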
def QA(doc_path="./docs/mic/"):
qa = akasha.Doc_QA(verbose=False, search_type="svm")
qa.get_response(
doc_path=doc_path,
prompt=query1,
chunk_size=500,
record_exp="",
search_type=test_search,
max_input_tokens=3000,
        system_prompt="請用中文回答",  # "please answer in Chinese"
)
print(qa.response)
response_list = qa.chain_of_thought(doc_path=doc_path, prompt_list=query2)
print(response_list)
## ask_whole_file
response = qa.ask_whole_file(
file_path="docs/mic/20230317_5軸工具機因應市場訴求改變的發展態勢.pdf",
        prompt="五軸是甚麼?",
model="openai:gpt-3.5-turbo-16k",
)
print(response)
## ask_self
qa.ask_self(model="openai:gpt-4",
max_input_tokens=10000,
prompt="langchain的套件版本?",
info=install_requires)
print(qa.response)
return qa
### logs ###
def QA_log(doc_path="./docs/mic/"):
# set keep_logs=True to keep logs
qa = akasha.Doc_QA(verbose=False, search_type="svm", keep_logs=True)
qa.get_response(
doc_path=doc_path,
prompt=query1,
chunk_size=500,
max_input_tokens=3000,
)
print(qa.response)
    ### qa.logs is a dictionary holding the log of each run: the key is the run's
    ### timestamp and the value is that run's log.
    # qa.timestamp_list returns the timestamps of all runs; use one as a key to fetch
    # that run's log.
timestamps = qa.timestamp_list
print(qa.logs[timestamps[-1]])
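    # Assuming Doc_QA exposes save_logs the way Model_Eval and Summary below do,
    # the collected logs could also be written to a file, e.g.:
    # qa.save_logs("./qa_logs.json")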
### stream output ###
def QA_stream(doc_path="./docs/mic/"):
# set stream=True to get stream output
qa = akasha.Doc_QA(verbose=False, search_type="auto", stream=True)
streaming = qa.get_response(
doc_path=doc_path,
prompt=query1,
chunk_size=500,
max_input_tokens=3000,
)
    ### the output is a generator; iterate over it to receive each chunk ###
for s in streaming:
print(s)
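    ### Plain-Python note (not in the original): the generator is consumed once, so to
    ### keep the full answer while printing, accumulate the chunks instead:
    # chunks = []
    # for s in streaming:
    #     print(s, end="")
    #     chunks.append(s)
    # answer = "".join(chunks)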
### pass model ###
def QA_pass_model(doc_path="./docs/mic/"):
### You can pass the model and embedding objects into Doc_QA to avoid creating them repeatedly. ###
model_obj = akasha.handle_model("openai:gpt-3.5-turbo", False, 0.0)
    ### You can pass a specific .env file to handle_model if you want to use a different API key. ###
# model_obj = akasha.handle_model("openai:gpt-3.5-turbo", False, 0.0, ".env2")
emb_obj = akasha.handle_embeddings()
qa = akasha.Doc_QA(model=model_obj, embeddings=emb_obj, search_type="auto")
    ### You can also create a db object and pass it into Doc_QA to avoid creating and loading the chromadb repeatedly. ###
db = akasha.createDB_directory(doc_path, emb_obj, ignore_check=True)
qa.get_response(
doc_path=db,
prompt=query1,
chunk_size=500,
max_input_tokens=3000,
)
print(qa.response)
def QA_prompt_format(doc_path="./docs/mic/"):
    # Choose prompt_format_type to make the response more accurate. #
    # Options: gpt, chat_mistral, chat_gpt, llama
qa = akasha.Doc_QA(
prompt_format_type="chat_gpt",
verbose=False,
search_type="svm",
)
qa.get_response(
doc_path=doc_path,
prompt=query1,
chunk_size=500,
max_input_tokens=3000,
)
print(qa.response)
### assign .env file ###
def assign_env(doc_path="./docs/mic/"):
    ### You can pass a .env file name to handle_model to assign a specific API key. ###
model_obj = akasha.handle_model("openai:gpt-3.5-turbo",
False,
0.0,
env_file=".env")
print(akasha.call_model(model_obj, "五軸是甚麼?"))
#######################
# #
# #
######## EVAL #########
# #
# #
#######################
### Note: question_style must match the type of questions in the questionset file (q_file).
def EVAL(doc_path: str = "./docs/mic/"):
eva = eval.Model_Eval(question_style="single_choice",
question_type="fact",
verbose=True)
b, c = eva.auto_create_questionset(
doc_path=doc_path,
question_num=2,
question_style="single_choice",
chunk_size=850,
output_file_path="questionset/mic_single_choice.txt",
)
# b,c = eva.auto_create_questionset(doc_path=doc_path, question_num=2, question_type="essay")
print(
eva.auto_evaluation(
questionset_file="questionset/mic_single_choice.txt",
doc_path=doc_path,
verbose=True,
))
# print(eva.__dict__)
# eva.optimum_combination(q_file="questionset/mic_15.txt", doc_path=doc_path, chunk_size_list=[500]\
# ,search_type_list=[test_search,"svm","tfidf"])
return eva
# eval_obj = EVAL()
# eval_obj.save_logs("./eva.json")
#######################
# #
# #
####### SUMMARY #######
# #
# #
#######################
def SUM(file_name: str = "./docs/mic/20230531_智慧製造需求下之邊緣運算與新興通訊發展分析.pdf"):
    sum_obj = summary.Summary(  # renamed from `sum` to avoid shadowing the builtin
chunk_size=1000,
chunk_overlap=40,
model="openai:gpt-3.5-turbo",
verbose=False,
threshold=0.2,
language="ch",
record_exp="1234test",
        format_prompt="請你在回答前面加上喵",  # "prepend 喵 (meow) to the answer"
        system_prompt="用中文回答",  # "answer in Chinese"
max_input_tokens=3000,
temperature=0.0,
)
    sum_obj.summarize_file(file_path=file_name,
                           summary_type="map_reduce",
                           summary_len=100,
                           verbose=True)
    print(sum_obj.logs)
    return sum_obj
# summary_obj = SUM()
# summary_obj.save_logs("./summary_logs.json")
# summary_obj.save_logs(file_type="txt")
### VISION ###
## requires gpt-4-vision ##
# ret = akasha.openai_vision(pic_path=["C:/Users/Pictures/oruya.png"], prompt="這張圖是甚麼意思?")
# print(ret)
### CALL TRANSLATOR ###
def TRANSLATOR():
ak = akasha.Doc_QA(verbose=False, search_type="auto")
response = ak.get_response(doc_path="docs/mic/", prompt="五軸是什麼?")
    # translate the (Chinese) answer into English and print it
    translated_response = akasha.helper.call_translator(ak.model_obj,
                                                        response,
                                                        language="en")
    print(translated_response)
### JSON FORMATTER ###
def JSON():
ak = akasha.Doc_QA(
threshold=0.0,
verbose=True,
)
    response = ak.ask_whole_file(file_path="docs/resume_pool/A.docx",
                                 prompt="以上是受試者的履歷,請回答該受試者的學歷、經驗、專長、年資")
    # prompt: "Above is the candidate's resume; report the candidate's education,
    # experience, specialties, and seniority."
formatted_response = akasha.helper.call_JSON_formatter(
ak.model_obj, response, keys=["學歷", "經驗", "專長", "年資"])
print(formatted_response, type(formatted_response))
### agent ###
def input_func(question: str):
response = input(question)
return str({"question": question, "answer": response})
def agent_example1():
input_tool = akasha.create_tool(
"user_question_tool",
"This is the tool to ask user question, the only one param question is the question string that has not been answered and we want to ask user.",
func=input_func)
ao = akasha.agent(verbose=True,
tools=[
input_tool,
akasha.get_saveJSON_tool(),
],
model="openai:gpt-3.5-turbo")
print(
ao("逐個詢問使用者以下問題,若所有問題都回答了,則將所有問題和回答儲存成default.json並結束。問題為:1.房間燈關了嗎? \n2. 有沒有人在家? \n3.有哪些電器開啟?\n"
))
def agent_example2():
ao = akasha.agent(tools=[
akasha.get_wiki_tool(),
akasha.get_saveJSON_tool(),
], )
print(ao("請用中文回答李遠哲跟馬英九誰比較老?將查到的資訊和答案儲存成json檔案,檔名為AGE.json"))
ao.save_logs("ao2.json")
def ask_agent():
ak = akasha.Doc_QA(verbose=True, search_type="auto")
ak.ask_agent(doc_path="./docs/mic/", prompt="LPWAN和5G的區別是什麼?")
print("follow_up: \n\n", ak.follow_up,
ak.intermediate_ans) ## follow_up = ['LPWAN是什麼?', '5G是什麼?']
response, new_follow_up, new_intermediate_ans = ak.rerun_ask_agent(
doc_path="./docs/mic/",
prompt="LPWAN和5G的區別是什麼?",
follow_up=['LPWAN是什麼?', '5G是什麼?', '物聯網環境的無線通訊技術有哪些?'])
print(response, new_follow_up, new_intermediate_ans)
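### A minimal entry point (not in the original file); uncomment the examples you want
### to run. All of them assume the ./docs/mic/ documents and a valid OpenAI key in .env.
if __name__ == "__main__":
    # QA()
    # QA_log()
    # QA_stream()
    # QA_pass_model()
    # QA_prompt_format()
    # EVAL()
    # SUM()
    # TRANSLATOR()
    # JSON()
    # agent_example1()
    # agent_example2()
    # ask_agent()
    pass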