f2.py
import pdfplumber
import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import CrossEncoder
import torch

# Ensure the punkt sentence tokenizer is downloaded before first use.
# NLTK >= 3.9 ships it as 'punkt_tab'; downloading both covers old and new versions.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
# Step 1: Extract text from PDF using pdfplumber
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() returns None for pages without a text layer
            text += (page.extract_text() or "") + "\n"
    return text
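
# A quick usage sketch (the file name below is a placeholder, not part of this repo):
#   text = extract_text_from_pdf("example.pdf")
#   print(text[:200])  # preview the first 200 extracted characters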
# Step 2: Chunk the text into smaller parts for efficient querying
def chunk_text(text, chunk_size=500):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        # Start a new chunk once adding the next sentence would exceed chunk_size;
        # the current_chunk guard avoids emitting an empty first chunk
        if len(current_chunk) + len(sentence) > chunk_size and current_chunk:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            current_chunk += " " + sentence
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
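
# A minimal sketch of how the chunker behaves, using a tiny chunk_size for
# illustration (real chunks would be ~500 characters as above):
#   chunk_text("One. Two. Three.", chunk_size=10)
#   -> ["One. Two.", "Three."]  # a new chunk starts at the sentence that would
#                               # push the current chunk past chunk_size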
# Step 3: Initialize Falcon 7B Instruct model for answer generation
def load_falcon_model():
    model_name = "tiiuae/falcon-7b-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,  # half precision roughly halves memory vs. fp32
        device_map="auto",
    )
    return tokenizer, model
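
# Loading falcon-7b-instruct in bfloat16 needs roughly 14 GB of GPU memory. A
# hedged alternative for smaller GPUs is 4-bit quantization via the optional
# bitsandbytes package (a sketch, not tested here):
#   from transformers import BitsAndBytesConfig
#   model = AutoModelForCausalLM.from_pretrained(
#       model_name,
#       device_map="auto",
#       quantization_config=BitsAndBytesConfig(load_in_4bit=True),
#   )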
# Step 4: Generate a response using Falcon 7B with a fact-based prompt
def generate_response_with_facts(query, relevant_chunk, tokenizer, model):
    prompt = f"""You are an intelligent assistant tasked with answering factual questions based on a document.
The document provided to you contains relevant information, but you should only provide an answer
if the information is directly available in the document. If the document doesn't contain enough information
to answer the question, respond with 'I don't have enough information to answer that question.'

Question: {query}
Context: {relevant_chunk}

Your answer should be based strictly on the information provided in the context. Do not make any assumptions,
do not hallucinate any information, and only give factual, verifiable answers."""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=256,  # cap the answer length, not prompt + answer
            pad_token_id=tokenizer.eos_token_id,
        )
    # Decode only the newly generated tokens so the prompt is not echoed back;
    # this covers both real answers and the "I don't have enough information" fallback
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:],
                                skip_special_tokens=True)
    return response.strip()
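
# generate() above decodes greedily, which suits fact-constrained answers. If
# more varied phrasing is wanted, sampling can be enabled with standard
# transformers kwargs (the values here are assumptions, tune to taste):
#   outputs = model.generate(inputs.input_ids, max_new_tokens=256,
#                            do_sample=True, temperature=0.7, top_p=0.9,
#                            pad_token_id=tokenizer.eos_token_id)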
# Step 5: Initialize CrossEncoder for ranking relevant chunks.
# Note: 'msmarco-distilbert-base-v3' is a bi-encoder checkpoint; loading it into
# CrossEncoder attaches a randomly initialized scoring head, so a purpose-built
# reranker such as 'cross-encoder/ms-marco-MiniLM-L-6-v2' is used instead.
def load_cross_encoder():
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    return cross_encoder

# Step 6: Get the most relevant chunk using CrossEncoder
def get_most_relevant_chunk(query, chunks, cross_encoder):
    # Score all (query, chunk) pairs in one batched call
    scores = cross_encoder.predict([(query, chunk) for chunk in chunks])
    most_relevant_idx = int(scores.argmax())  # index of the highest-scoring chunk
    return chunks[most_relevant_idx]
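
# Sketch: instead of the single argmax chunk, the top-k chunks could be joined
# to give the model more context (k=3 and newline-joining are assumptions):
#   import numpy as np
#   top_k = np.argsort(scores)[-3:][::-1]
#   context = "\n\n".join(chunks[i] for i in top_k)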
# Main function to process the query
def answer_query(pdf_path, query, chunk_size=500):
    # Load the models
    tokenizer, model = load_falcon_model()
    cross_encoder = load_cross_encoder()

    # Step 1: Extract text from the PDF
    pdf_text = extract_text_from_pdf(pdf_path)

    # Step 2: Chunk the text into smaller parts
    chunks = chunk_text(pdf_text, chunk_size)

    # Step 3: Find the most relevant chunk using the CrossEncoder
    relevant_chunk = get_most_relevant_chunk(query, chunks, cross_encoder)

    # Step 4: Generate an answer using Falcon 7B with the relevant chunk as context
    response = generate_response_with_facts(query, relevant_chunk, tokenizer, model)
    return response
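
# For repeated questions against the same PDF, loading the 7B model once and
# reusing it across calls avoids the dominant startup cost (a sketch; it would
# need a variant of answer_query that accepts already-loaded models):
#   tokenizer, model = load_falcon_model()
#   cross_encoder = load_cross_encoder()
#   for q in ["What is multithreading?", "What is a race condition?"]:
#       ...  # rerank + generate with the already-loaded models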
if __name__ == "__main__":
    # Input PDF path and query
    pdf_path = "cj7v2ch1.pdf"  # Change this to your PDF file path
    query = "What is multithreading?"  # Your query

    # Get the answer from the PDF content
    answer = answer_query(pdf_path, query)
    print("Answer:", answer)