app.py
# import os
# import pandas as pd
# import streamlit as st
# from PyPDF2 import PdfReader
# from langchain.text_splitter import CharacterTextSplitter
# from langchain.embeddings.openai import OpenAIEmbeddings
# from langchain.vectorstores import FAISS
# from langchain.chains.question_answering import load_qa_chain
# from langchain.llms import OpenAI
# from langchain.callbacks import get_openai_callback
#
# def main():
#     st.set_page_config(page_title="Ask your PDF")
#     st.header("Ask your PDF 💬")
#
#     # upload file
#     pdf = st.file_uploader("Upload your PDF", type="pdf")
#
#     # extract the text
#     if pdf is not None:
#         pdf_reader = PdfReader(pdf)
#         text = ""
#         for page in pdf_reader.pages:
#             text += page.extract_text()
#
#         # split into chunks
#         text_splitter = CharacterTextSplitter(
#             separator="\n",
#             chunk_size=1000,
#             chunk_overlap=200,
#             length_function=len
#         )
#         chunks = text_splitter.split_text(text)
#
#         # create or load existing CSV file for the PDF
#         csv_filename = os.path.splitext(pdf.name)[0] + ".csv"
#         if os.path.exists(csv_filename):
#             df = pd.read_csv(csv_filename)
#         else:
#             df = pd.DataFrame(columns=["Question", "Response"])
#
#         # create embeddings
#         embeddings = OpenAIEmbeddings()
#         knowledge_base = FAISS.from_texts(chunks, embeddings)
#
#         user_question = st.text_input("Ask a question about your PDF:")
#         if user_question:
#             docs = knowledge_base.similarity_search(user_question)
#
#             llm = OpenAI()
#             chain = load_qa_chain(llm, chain_type="stuff")
#             with get_openai_callback() as cb:
#                 response = chain.run(input_documents=docs, question=user_question)
#                 print(cb)
#
#             # check if the question is already present in the CSV
#             if user_question not in df["Question"].values:
#                 new_data = {"Question": user_question, "Response": response}
#                 df = pd.concat([df, pd.DataFrame([new_data])], ignore_index=True)
#                 df.to_csv(csv_filename, index=False)  # save the updated CSV file
#
#             st.write(response)
#
# if __name__ == '__main__':
#     main()
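# Active implementation (the block above is the earlier, single-question version,
# kept commented out for reference): upload a PDF and a CSV of questions in the
# sidebar, answer each question with a LangChain "stuff" QA chain over a FAISS
# index of the PDF text, show the answers in the page, and cache
# Question/Response pairs in a CSV named after the PDF.
# Run with: streamlit run app.py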
import os
import pandas as pd
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.callbacks import get_openai_callback
import time
def main():
    st.set_page_config(page_title="Ask your PDF", layout="centered")
    st.title("Ask your PDF 💬")

    # Sidebar for upload functionality
    st.sidebar.header("Upload")
    pdf = st.sidebar.file_uploader("Upload PDF", type="pdf")
    questions_csv = st.sidebar.file_uploader("Upload CSV with Questions", type="csv")
    csv_submit = st.sidebar.button("Submit CSV", key="csv_submit")

    # Main content area
    response_container = st.container()

    # Extract text from PDF and process questions
    if pdf is not None:
        pdf_reader = PdfReader(pdf)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text()

        text_splitter = CharacterTextSplitter(
            separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len
        )
        chunks = text_splitter.split_text(text)
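        # 1000-character chunks with a 200-character overlap, so context that
        # straddles a chunk boundary stays retrievable.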
        # Create or load existing CSV file for the PDF
        csv_filename = os.path.splitext(pdf.name)[0] + ".csv"
        if os.path.exists(csv_filename):
            df = pd.read_csv(csv_filename)
        else:
            df = pd.DataFrame(columns=["Question", "Response"])

        # Create embeddings and knowledge base
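        # Note: the LangChain OpenAI wrappers used below read the OPENAI_API_KEY
        # environment variable, so it must be set before running the app.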
        embeddings = OpenAIEmbeddings()
        knowledge_base = FAISS.from_texts(chunks, embeddings)
        st.sidebar.button("PDF Submitted", key="pdf_submitted", disabled=True)
        # Process questions from CSV and generate responses
        if questions_csv is not None and csv_submit:
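            # The uploaded CSV is expected to contain a "Question" column, e.g.:
            #   Question
            #   What is this document about?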
            questions_df = pd.read_csv(questions_csv)
            for index, row in questions_df.iterrows():
                user_question = row["Question"]
                docs = knowledge_base.similarity_search(user_question)

                llm = OpenAI()
                chain = load_qa_chain(llm, chain_type="stuff")
                with get_openai_callback() as cb:
                    response = chain.run(input_documents=docs, question=user_question)
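                # cb collects token-usage and cost stats for the call above;
                # unlike the commented-out earlier version, they are not printed here.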
                with response_container:
                    st.write(f"### Question: {user_question}")
                    response_lines = response.split("\n")
                    for line in response_lines:
                        st.write(line)
                        time.sleep(0.5)

                # Append response to responses dataframe
                if user_question not in df["Question"].values:
                    new_data = {"Question": user_question, "Response": response}
                    df = pd.concat([df, pd.DataFrame([new_data])], ignore_index=True)

                time.sleep(1)

            # Save responses to CSV file
            st.write("Saving responses to CSV...")
            df.to_csv(csv_filename, index=False)
            st.write("Responses saved successfully.")
            st.sidebar.button("CSV Submitted", key="csv_submitted", disabled=True)


if __name__ == '__main__':
    main()