import os

import PyPDF2
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS
from langchain.chains import RetrievalQA
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_experimental.data_anonymizer import PresidioAnonymizer
from presidio_anonymizer.entities import OperatorConfig
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are an assistant that helps users with their medical analysis. The following context may contain sensitive information. Ensure that your response does not include any Personally Identifiable Information (PII) such as names, phone numbers, or email addresses.
Context: {context}
Query: {question}
Provide a safe, anonymized response that answers the query while respecting privacy guidelines.
""",
)
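# Note: RetrievalQA's default "stuff" chain fills {context} with the retrieved
# documents and maps the incoming query to {question}, so these two variable
# names must stay exactly as they are for the custom prompt to work.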
anonymizer = PresidioAnonymizer(
    analyzed_fields=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS"],
    operators={
        "PERSON": OperatorConfig("redact", {}),
        "PHONE_NUMBER": OperatorConfig("redact", {}),
        "EMAIL_ADDRESS": OperatorConfig("redact", {}),
    },
)
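# The "redact" operator above strips each detected PII span out entirely;
# Presidio's "replace" operator would instead substitute a placeholder such as
# "<PERSON>", which can be preferable when sentence structure matters.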
llm = Ollama(model="tinyllama", base_url="http://ollama-container:11434")
app = Flask(__name__)
CORS(app)
# Function to anonymize PII from the extracted text
def anonymize_pii(text):
    return anonymizer.anonymize(text)
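# For example (exact output depends on Presidio's detectors):
#   anonymize_pii("Contact John Doe at john@example.com")
# should come back with the name and email address redacted away.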
# Function to extract text from a PDF, anonymizing it before returning
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for pages without a text layer
            text += page.extract_text() or ""
    return anonymize_pii(text)
# Extracting text from all PDFs in a directory
pdf_dir = "./data/"
documents = []
for filename in os.listdir(pdf_dir):
    print(f"Processing {filename}")
    if filename.endswith(".pdf"):
        text = extract_text_from_pdf(os.path.join(pdf_dir, filename))
        documents.append(Document(page_content=text))
# Step 1: Create embeddings for the documents
embedding_model = OllamaEmbeddings(model="tinyllama", base_url="http://ollama-container:11434")
docsearch = FAISS.from_documents(documents, embedding_model)
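# The FAISS index lives in memory and is rebuilt on every start; for larger
# corpora it could be persisted with docsearch.save_local(...) and restored
# with FAISS.load_local(...) instead of re-embedding every document each run.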
# Step 2: Create a RetrievalQA chain using the local LLM
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt_template},
)
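# With return_source_documents=True, the chain's output dict carries both the
# answer under "result" and the retrieved chunks under "source_documents".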
# Step 3: Define a query
# query = "Show me any information about the medical data that was collected."
# Step 4: Run the query
# result = qa_chain({"query": query})
# print(result)
@app.route("/")
def hello():
return render_template("index.html")
@app.route("/query", methods=["POST"])
def query():
data = request.json
query = data.get("query")
if not query:
return jsonify({"error": "No query provided"}), 400
result = qa_chain({"query": query})
return jsonify({"result": result["result"]})
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8906)
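# Example request against a running instance (hypothetical values; adjust the
# host and port to your deployment):
#   curl -X POST http://localhost:8906/query \
#        -H "Content-Type: application/json" \
#        -d '{"query": "What trends appear in the collected medical data?"}'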