p3.py
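
"""NLP-enhanced PDF chatbot built with Streamlit.

Pipeline: extract text with pdfplumber, retrieve the most relevant segment via
sentence-transformers embeddings, classify the question's intent with a DistilBERT
classifier, and answer by pulling the matching spaCy entity from that segment.

Run with:
    streamlit run p3.py

The spaCy model must be installed first, e.g.:
    python -m spacy download en_core_web_trf
"""
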
import streamlit as st
import pdfplumber
import spacy
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
# Load models
ner_model = spacy.load("en_core_web_trf") # Transformer-based NER for better accuracy
embedding_model = SentenceTransformer('all-MiniLM-L6-v2') # For context similarity
# Intent recognition model (DistilBERT fine-tuned)
intent_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
intent_model = AutoModelForSequenceClassification.from_pretrained("mrm8488/distilbert-base-uncased-finetuned-intent")
intent_pipeline = pipeline("text-classification", model=intent_model, tokenizer=intent_tokenizer)
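
# NOTE (assumption): the checkpoint above is assumed to emit intent labels that match
# the keys of INTENT_TO_ENTITY below; if it returns generic labels (e.g. "LABEL_0"),
# adjust this mapping or the model's id2label config accordingly.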
# Map intents to entity types
INTENT_TO_ENTITY = {
    "author": "PERSON",
    "date": "DATE",
    "title": "WORK_OF_ART",
    "location": "GPE"
}

# Process a single PDF and extract its text
def process_pdf(file):
    with pdfplumber.open(file) as pdf:
        # Guard against pages with no extractable text (extract_text() may return None
        # in some pdfplumber versions), which would break the join below
        text = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(text)

# Fetch context based on question
def fetch_relevant_context(question, text_segments):
    question_embedding = embedding_model.encode(question, convert_to_tensor=True)
    segment_embeddings = embedding_model.encode(text_segments, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(question_embedding, segment_embeddings)
    top_index = scores.argmax().item()
    return text_segments[top_index]

# Predict intent using the intent recognition model
def predict_intent(question):
    prediction = intent_pipeline(question)
    return prediction[0]["label"]  # Predicted intent

# Extract relevant entities from the context based on intent
def extract_entities(context, intent):
    entity_type = INTENT_TO_ENTITY.get(intent)
    if not entity_type:
        return "No relevant entity type for this intent."
    doc = ner_model(context)
    entities = [ent.text for ent in doc.ents if ent.label_ == entity_type]
    return entities[0] if entities else "No relevant entities found."

# Streamlit Sidebar for File Upload
st.sidebar.title("📂 Upload PDF")
uploaded_file = st.sidebar.file_uploader("Upload a PDF", type="pdf")

if uploaded_file:
    # Step 1: Process the uploaded PDF
    raw_text = process_pdf(uploaded_file)
    text_segments = raw_text.split("\n\n")  # Split into segments for context retrieval
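    # NOTE (assumption): this blank-line split presumes paragraph breaks survive text
    # extraction; pdfplumber usually separates lines with a single "\n", so segments
    # can be coarse. Page-level or fixed-size chunking is a reasonable alternative.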
    st.sidebar.success("PDF uploaded and processed successfully!")

    # Main App Content
    st.title("🤖 NLP-Enhanced Chatbot")
    st.write("Ask factual questions about the uploaded PDF content.")

    # Step 2: Ask a question
    question = st.text_input("Ask a question (e.g., 'Who is the author?', 'When was it published?'):")
    if question:
        # Step 3: Predict intent using NLP
        intent = predict_intent(question)
        st.write(f"**Predicted Intent:** {intent}")

        # Step 4: Fetch relevant context
        context = fetch_relevant_context(question, text_segments)
        st.write(f"**Relevant Context:** {context}")

        # Step 5: Extract entities and answer
        answer = extract_entities(context, intent)
        st.write(f"**Answer:** {answer}")
else:
    st.warning("Please upload a PDF to get started.")