p3.py
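
"""NLP-enhanced PDF chatbot built with Streamlit.

Pipeline: extract text with pdfplumber, retrieve the most relevant segment via
sentence-transformers embeddings, classify the question's intent with a DistilBERT
classifier, and answer by pulling the matching spaCy entity from that segment.

Run with:
    streamlit run p3.py

The spaCy model must be installed first, e.g.:
    python -m spacy download en_core_web_trf
"""
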
import streamlit as st
import pdfplumber
import spacy
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
# Load models
ner_model = spacy.load("en_core_web_trf") # Transformer-based NER for better accuracy
embedding_model = SentenceTransformer('all-MiniLM-L6-v2') # For context similarity
# Intent recognition model (DistilBERT fine-tuned)
intent_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
intent_model = AutoModelForSequenceClassification.from_pretrained("mrm8488/distilbert-base-uncased-finetuned-intent")
intent_pipeline = pipeline("text-classification", model=intent_model, tokenizer=intent_tokenizer)
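
# NOTE (assumption): the checkpoint above is assumed to emit intent labels that match
# the keys of INTENT_TO_ENTITY below; if it returns generic labels (e.g. "LABEL_0"),
# adjust this mapping or the model's id2label config accordingly.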
# Map intents to entity types
INTENT_TO_ENTITY = {
    "author": "PERSON",
    "date": "DATE",
    "title": "WORK_OF_ART",
    "location": "GPE"
}

# Process a single PDF and extract its text
def process_pdf(file):
    with pdfplumber.open(file) as pdf:
        # Guard against pages with no extractable text (extract_text() may return None
        # in some pdfplumber versions), which would break the join below
        text = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(text)

# Fetch context based on question
def fetch_relevant_context(question, text_segments):
    question_embedding = embedding_model.encode(question, convert_to_tensor=True)
    segment_embeddings = embedding_model.encode(text_segments, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(question_embedding, segment_embeddings)
    top_index = scores.argmax().item()
    return text_segments[top_index]

# Predict intent using the intent recognition model
def predict_intent(question):
    prediction = intent_pipeline(question)
    return prediction[0]["label"]  # Predicted intent

# Extract relevant entities from the context based on intent
def extract_entities(context, intent):
    entity_type = INTENT_TO_ENTITY.get(intent)
    if not entity_type:
        return "No relevant entity type for this intent."
    doc = ner_model(context)
    entities = [ent.text for ent in doc.ents if ent.label_ == entity_type]
    return entities[0] if entities else "No relevant entities found."

# Streamlit Sidebar for File Upload
st.sidebar.title("📂 Upload PDF")
uploaded_file = st.sidebar.file_uploader("Upload a PDF", type="pdf")

if uploaded_file:
    # Step 1: Process the uploaded PDF
    raw_text = process_pdf(uploaded_file)
    text_segments = raw_text.split("\n\n")  # Split into segments for context retrieval
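    # NOTE (assumption): this blank-line split presumes paragraph breaks survive text
    # extraction; pdfplumber usually separates lines with a single "\n", so segments
    # can be coarse. Page-level or fixed-size chunking is a reasonable alternative.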
    st.sidebar.success("PDF uploaded and processed successfully!")

    # Main App Content
    st.title("🤖 NLP-Enhanced Chatbot")
    st.write("Ask factual questions about the uploaded PDF content.")

    # Step 2: Ask a question
    question = st.text_input("Ask a question (e.g., 'Who is the author?', 'When was it published?'):")
    if question:
        # Step 3: Predict intent using NLP
        intent = predict_intent(question)
        st.write(f"**Predicted Intent:** {intent}")

        # Step 4: Fetch relevant context
        context = fetch_relevant_context(question, text_segments)
        st.write(f"**Relevant Context:** {context}")

        # Step 5: Extract entities and answer
        answer = extract_entities(context, intent)
        st.write(f"**Answer:** {answer}")
else:
    st.warning("Please upload a PDF to get started.")