import pdfplumber
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer, CrossEncoder
import torch
import numpy as np
import faiss
# Ensure you have the punkt tokenizer downloaded
nltk.download('punkt')
# Step 1: Extract text from PDF using pdfplumber
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() can return None for pages without extractable text
            text += (page.extract_text() or "") + "\n"
    return text
# Step 2: Advanced Chunking (Paragraphs, Sentences, and Word-Level Chunks)
def advanced_chunking(text, sentence_chunk_size=5, word_chunk_size=50):
    paragraphs = text.split("\n\n")  # Split by paragraphs
    sentence_chunks = []
    word_chunks = []

    # Sentence chunking (fixed, non-overlapping windows of sentence_chunk_size sentences)
    sentences = sent_tokenize(text)
    for i in range(0, len(sentences), sentence_chunk_size):
        sentence_chunks.append(" ".join(sentences[i:i + sentence_chunk_size]))

    # Word chunking (fixed, non-overlapping windows of word_chunk_size tokens)
    words = word_tokenize(text)
    for i in range(0, len(words), word_chunk_size):
        word_chunks.append(" ".join(words[i:i + word_chunk_size]))

    return paragraphs, sentence_chunks, word_chunks
# Step 3: Load Falcon Model and Embedding Models (SentenceTransformer for embeddings)
def load_models():
    # Load Falcon 7B Instruct for response generation
    model_name = "tiiuae/falcon-7b-instruct"
    tokenizer_falcon = AutoTokenizer.from_pretrained(model_name)
    model_falcon = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

    # Load the pre-trained sentence transformer model for embedding generation
    embedder = SentenceTransformer('all-MiniLM-L6-v2')

    # Load a cross-encoder for relevance re-ranking (returned here but not called in the
    # retrieval step below; see the re-ranking sketch after semantic_search)
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

    return tokenizer_falcon, model_falcon, embedder, cross_encoder
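# Memory note (a hedged sketch, not part of the original pipeline): Falcon-7B in full
# precision needs a large amount of GPU memory. If your hardware supports it, loading the
# weights in bfloat16 roughly halves the footprint. The function name below is an
# assumption for illustration; torch_dtype is a standard from_pretrained argument.
def load_falcon_bf16(model_name="tiiuae/falcon-7b-instruct"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.bfloat16,  # half-precision weights; hardware dependent
    )
    return tokenizer, model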
# Step 4: Generate Embeddings for Chunks and Query
def generate_embeddings(texts, embedder):
    embeddings = embedder.encode(texts, convert_to_tensor=True)
    return embeddings
# Step 5: Find the Most Relevant Chunk (Semantic Search with FAISS)
def semantic_search(query_embedding, chunk_embeddings):
    # Convert to numpy arrays for FAISS
    query_embedding = query_embedding.cpu().detach().numpy()
    chunk_embeddings = chunk_embeddings.cpu().detach().numpy()

    # Exact FAISS index over L2 (Euclidean) distance; for cosine similarity you would
    # normalize the embeddings and use an inner-product index instead
    index = faiss.IndexFlatL2(chunk_embeddings.shape[1])
    index.add(chunk_embeddings)  # Add chunk embeddings to the FAISS index

    # Perform search (k=1 for the single most relevant chunk)
    D, I = index.search(query_embedding, k=1)
    return I[0][0]  # Index of the most relevant chunk
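# Optional re-ranking step (a minimal sketch, not part of the original pipeline):
# load_models() returns a cross_encoder that the code above never calls. One common way
# to use it is to retrieve the top-k chunks with FAISS and let the cross-encoder pick the
# best of those k. The helper name and the default k below are assumptions; the
# CrossEncoder.predict call and the FAISS search are standard.
def rerank_with_cross_encoder(query, query_embedding, chunk_embeddings, all_chunks, cross_encoder, k=5):
    query_np = query_embedding.cpu().detach().numpy()
    chunks_np = chunk_embeddings.cpu().detach().numpy()

    # Retrieve the k nearest chunks with the same exact L2 index used in semantic_search
    index = faiss.IndexFlatL2(chunks_np.shape[1])
    index.add(chunks_np)
    _, I = index.search(query_np, k=min(k, len(all_chunks)))

    # Score each (query, candidate chunk) pair with the cross-encoder and keep the best
    candidates = [all_chunks[i] for i in I[0]]
    scores = cross_encoder.predict([(query, chunk) for chunk in candidates])
    best = int(np.argmax(scores))
    return candidates[best]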
# Step 6: Generate Response using Falcon 7B
def generate_response_with_falcon(query, relevant_chunk, tokenizer_falcon, model_falcon):
    prompt = f"""
You are an intelligent assistant tasked with answering factual questions based on a document.
The document provided to you contains relevant information, but you should only provide an answer
if the information is directly available in the document. If the document doesn't contain enough information
to answer the question, respond with 'I don't have enough information to answer that question.'

Question: {query}
Context: {relevant_chunk}

Your answer should be based strictly on the information provided in the context. Do not make any assumptions,
do not hallucinate any information, and only give factual, verifiable answers.
"""

    # Prepare input for the Falcon model (input_ids and attention_mask)
    inputs = tokenizer_falcon(prompt, return_tensors="pt").to(model_falcon.device)

    # Generate output; max_new_tokens bounds the answer length independently of the prompt length
    outputs = model_falcon.generate(
        **inputs,
        max_new_tokens=256,
        num_return_sequences=1,
        pad_token_id=tokenizer_falcon.eos_token_id,  # Falcon has no dedicated pad token
    )
    response = tokenizer_falcon.decode(outputs[0], skip_special_tokens=True)
    return response.strip()
# Step 7: Main Query Processing Function
def answer_query(pdf_path, query, sentence_chunk_size=5, word_chunk_size=50):
    # Step 1: Extract and chunk the PDF content
    pdf_text = extract_text_from_pdf(pdf_path)
    paragraphs, sentence_chunks, word_chunks = advanced_chunking(pdf_text, sentence_chunk_size, word_chunk_size)

    # Combine all chunks into a single list (to be embedded)
    all_chunks = paragraphs + sentence_chunks + word_chunks

    # Step 2: Load models (Falcon, Cross-Encoder, Embedding)
    tokenizer_falcon, model_falcon, embedder, cross_encoder = load_models()

    # Step 3: Generate embeddings for all chunks
    chunk_embeddings = generate_embeddings(all_chunks, embedder)

    # Step 4: Generate query embedding
    query_embedding = generate_embeddings([query], embedder)

    # Step 5: Perform semantic search to find the most relevant chunk
    relevant_chunk_index = semantic_search(query_embedding, chunk_embeddings)

    # Retrieve the most relevant chunk
    relevant_chunk = all_chunks[relevant_chunk_index]

    # Step 6: Generate a response using Falcon 7B with the relevant chunk as context
    answer = generate_response_with_falcon(query, relevant_chunk, tokenizer_falcon, model_falcon)
    return answer
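# Reuse note (a minimal sketch, assuming you ask several questions of the same PDF):
# answer_query rebuilds the FAISS index inside semantic_search on every call. The helpers
# below, whose names are hypothetical, show how the index could be built once and then
# searched repeatedly with only the query embedding changing.
def build_chunk_index(chunk_embeddings):
    chunks_np = chunk_embeddings.cpu().detach().numpy()
    index = faiss.IndexFlatL2(chunks_np.shape[1])
    index.add(chunks_np)
    return index

def search_chunk_index(index, query_embedding, k=1):
    query_np = query_embedding.cpu().detach().numpy()
    _, I = index.search(query_np, k=k)
    return I[0]  # Indices of the k most relevant chunks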
# Example usage
if __name__ == "__main__":
    # Input PDF path and query
    pdf_path = "cj7v2ch1.pdf"  # Change this to your PDF file path
    query = "What is multithreading?"  # Your query

    # Get the answer from the PDF content
    answer = answer_query(pdf_path, query)
    print("Answer:", answer)