# app.py
import streamlit as st
from dotenv import load_dotenv
import pickle
from pypdf import PdfReader
from streamlit_extras.add_vertical_space import add_vertical_space
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
# get callback to get stats on query cost
from langchain.callbacks import get_openai_callback
import os
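# load_dotenv() below reads a local .env file; based on the code in this file it
# should define OPENAI_API_KEY (consumed by OpenAIEmbeddings / ChatOpenAI) and,
# optionally, REUSE_PKL_STORE="True" to reuse a pickled FAISS index (see get_embeddings)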
load_dotenv()

# Sidebar contents
with st.sidebar:
    st.title('💬 PDF Chat App')
    st.markdown('''
    ## About
    This app is an LLM-powered PDF chatbot built using:
    - [Streamlit](https://streamlit.io/) Frontend Framework
    - [LangChain](https://python.langchain.com/) App Framework
    - [OpenAI](https://platform.openai.com/docs/models) LLM model
    - [FAISS](https://github.com/facebookresearch/faiss) vector store
    ''')
    add_vertical_space(5)
    st.write('Made by [William Gillett](https://github.com/wmgillett/chat-pdf-langchain-faiss-streamlit)')

def main():
    st.write("Hello")
    st.header("Chat with PDF 💬")
    # upload a PDF file
    pdf = upload_pdf()
    # check for pdf file
    if pdf is not None:
        # process text in pdf and convert to chunks
        chunk_size = 500
        chunk_overlap = 100
        chunks = process_text(pdf, chunk_size, chunk_overlap)
        vector_store = get_embeddings(chunks, pdf)
        # ask the user for a question
        question = st.text_input("Ask a question")
        if question:
            # get the docs related to the question
            docs = retrieve_docs(question, vector_store)
            response = generate_response(docs, question)
            st.write(response)

# upload a pdf file from the website
def upload_pdf():
    pdf = st.file_uploader("Upload your PDF", type="pdf")
    return pdf

# convert the pdf to text chunks
def process_text(pdf, chunk_size, chunk_overlap):
    pdf_reader = PdfReader(pdf)
    # extract the text from the PDF
    page_text = ""
    for page in pdf_reader.pages:
        page_text += page.extract_text()
    # split the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    chunks = text_splitter.split_text(text=page_text)
    if chunks:
        return chunks
    else:
        raise ValueError("Could not process text in PDF")
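# Note: with its default separators, RecursiveCharacterTextSplitter tries paragraph
# breaks first, then line breaks, then spaces, then single characters; the
# chunk_size / chunk_overlap limits above are measured with length_function (len),
# i.e. in characters rather than tokens.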

# find or create the embeddings
def get_embeddings(chunks, pdf):
    store_name = pdf.name[:-4]
    # check if vector store already exists
    # if REUSE_PKL_STORE is True, then load the vector store from disk if it exists
    reuse_pkl_store = os.getenv("REUSE_PKL_STORE")
    if reuse_pkl_store == "True" and os.path.exists(f"{store_name}.pkl"):
        with open(f"{store_name}.pkl", "rb") as f:
            vector_store = pickle.load(f)
        st.write("Embeddings loaded from disk")
    # else create embeddings and save to disk
    else:
        embeddings = OpenAIEmbeddings()
        # create vector store to hold the embeddings
        vector_store = FAISS.from_texts(chunks, embedding=embeddings)
        # save the vector store to disk
        with open(f"{store_name}.pkl", "wb") as f:
            pickle.dump(vector_store, f)
        st.write("Embeddings saved to disk")
    if vector_store is not None:
        return vector_store
    else:
        raise ValueError("Issue creating and saving vector store")

# retrieve the docs related to the question
def retrieve_docs(question, vector_store):
    docs = vector_store.similarity_search(question, k=3)
    if len(docs) == 0:
        raise Exception("No documents found")
    else:
        return docs
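
# Note: chain_type="stuff" below concatenates all retrieved chunks into a single
# prompt for the model; with k=3 chunks of roughly 500 characters each, that
# comfortably fits within the gpt-3.5-turbo context window.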
# generate the response
def generate_response(docs, question):
    llm = ChatOpenAI(temperature=0.0, max_tokens=1000, model_name="gpt-3.5-turbo")
    chain = load_qa_chain(llm=llm, chain_type="stuff")
    with get_openai_callback() as cb:
        response = chain.run(input_documents=docs, question=question)
        print(cb)
    return response

if __name__ == '__main__':
    main()
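
# To run locally (a typical setup, inferred from the imports above rather than the
# repo's own docs): install the dependencies, e.g.
#   pip install streamlit python-dotenv pypdf streamlit-extras langchain openai faiss-cpu
# then start the app with
#   streamlit run app.py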