import pdfplumber
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer, CrossEncoder
import torch
import numpy as np
import faiss
# Ensure you have the punkt tokenizer downloaded
nltk.download('punkt')
# Step 1: Extract text from PDF using pdfplumber
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() can return None for pages without extractable text
            text += (page.extract_text() or "") + "\n"
    return text
# Step 2: Advanced Chunking (Paragraphs, Sentences, and Word-Level Chunks)
def advanced_chunking(text, sentence_chunk_size=5, word_chunk_size=50):
    paragraphs = text.split("\n\n")  # Split by paragraphs
    sentence_chunks = []
    word_chunks = []

    # Sentence chunking (fixed, non-overlapping windows of sentence_chunk_size sentences)
    sentences = sent_tokenize(text)
    for i in range(0, len(sentences), sentence_chunk_size):
        sentence_chunks.append(" ".join(sentences[i:i + sentence_chunk_size]))

    # Word chunking (fixed, non-overlapping windows of word_chunk_size tokens)
    words = word_tokenize(text)
    for i in range(0, len(words), word_chunk_size):
        word_chunks.append(" ".join(words[i:i + word_chunk_size]))

    return paragraphs, sentence_chunks, word_chunks
# Step 3: Load Falcon Model and Embedding Models (SentenceTransformer for embeddings)
def load_models():
    # Load Falcon 7B Instruct for response generation
    model_name = "tiiuae/falcon-7b-instruct"
    tokenizer_falcon = AutoTokenizer.from_pretrained(model_name)
    model_falcon = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

    # Load the pre-trained sentence transformer model for embedding generation
    embedder = SentenceTransformer('all-MiniLM-L6-v2')

    # Load a cross-encoder for relevance re-ranking (returned here but not called in the
    # retrieval step below; see the re-ranking sketch after semantic_search)
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

    return tokenizer_falcon, model_falcon, embedder, cross_encoder
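# Memory note (a hedged sketch, not part of the original pipeline): Falcon-7B in full
# precision needs a large amount of GPU memory. If your hardware supports it, loading the
# weights in bfloat16 roughly halves the footprint. The function name below is an
# assumption for illustration; torch_dtype is a standard from_pretrained argument.
def load_falcon_bf16(model_name="tiiuae/falcon-7b-instruct"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.bfloat16,  # half-precision weights; hardware dependent
    )
    return tokenizer, model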
# Step 4: Generate Embeddings for Chunks and Query
def generate_embeddings(texts, embedder):
    embeddings = embedder.encode(texts, convert_to_tensor=True)
    return embeddings
# Step 5: Find the Most Relevant Chunk (Semantic Search with FAISS)
def semantic_search(query_embedding, chunk_embeddings):
    # Convert to numpy arrays for FAISS
    query_embedding = query_embedding.cpu().detach().numpy()
    chunk_embeddings = chunk_embeddings.cpu().detach().numpy()

    # Exact FAISS index over L2 (Euclidean) distance; for cosine similarity you would
    # normalize the embeddings and use an inner-product index instead
    index = faiss.IndexFlatL2(chunk_embeddings.shape[1])
    index.add(chunk_embeddings)  # Add chunk embeddings to the FAISS index

    # Perform search (k=1 for the single most relevant chunk)
    D, I = index.search(query_embedding, k=1)
    return I[0][0]  # Index of the most relevant chunk
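# Optional re-ranking step (a minimal sketch, not part of the original pipeline):
# load_models() returns a cross_encoder that the code above never calls. One common way
# to use it is to retrieve the top-k chunks with FAISS and let the cross-encoder pick the
# best of those k. The helper name and the default k below are assumptions; the
# CrossEncoder.predict call and the FAISS search are standard.
def rerank_with_cross_encoder(query, query_embedding, chunk_embeddings, all_chunks, cross_encoder, k=5):
    query_np = query_embedding.cpu().detach().numpy()
    chunks_np = chunk_embeddings.cpu().detach().numpy()

    # Retrieve the k nearest chunks with the same exact L2 index used in semantic_search
    index = faiss.IndexFlatL2(chunks_np.shape[1])
    index.add(chunks_np)
    _, I = index.search(query_np, k=min(k, len(all_chunks)))

    # Score each (query, candidate chunk) pair with the cross-encoder and keep the best
    candidates = [all_chunks[i] for i in I[0]]
    scores = cross_encoder.predict([(query, chunk) for chunk in candidates])
    best = int(np.argmax(scores))
    return candidates[best]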
# Step 6: Generate Response using Falcon 7B
def generate_response_with_falcon(query, relevant_chunk, tokenizer_falcon, model_falcon):
    prompt = f"""
You are an intelligent assistant tasked with answering factual questions based on a document.
The document provided to you contains relevant information, but you should only provide an answer
if the information is directly available in the document. If the document doesn't contain enough information
to answer the question, respond with 'I don't have enough information to answer that question.'

Question: {query}
Context: {relevant_chunk}

Your answer should be based strictly on the information provided in the context. Do not make any assumptions,
do not hallucinate any information, and only give factual, verifiable answers.
"""

    # Prepare input for the Falcon model (input_ids and attention_mask)
    inputs = tokenizer_falcon(prompt, return_tensors="pt").to(model_falcon.device)

    # Generate output; max_new_tokens bounds the answer length independently of the prompt length
    outputs = model_falcon.generate(
        **inputs,
        max_new_tokens=256,
        num_return_sequences=1,
        pad_token_id=tokenizer_falcon.eos_token_id,  # Falcon has no dedicated pad token
    )
    response = tokenizer_falcon.decode(outputs[0], skip_special_tokens=True)
    return response.strip()
# Step 7: Main Query Processing Function
def answer_query(pdf_path, query, sentence_chunk_size=5, word_chunk_size=50):
    # Step 1: Extract and chunk the PDF content
    pdf_text = extract_text_from_pdf(pdf_path)
    paragraphs, sentence_chunks, word_chunks = advanced_chunking(pdf_text, sentence_chunk_size, word_chunk_size)

    # Combine all chunks into a single list (to be embedded)
    all_chunks = paragraphs + sentence_chunks + word_chunks

    # Step 2: Load models (Falcon, Cross-Encoder, Embedding)
    tokenizer_falcon, model_falcon, embedder, cross_encoder = load_models()

    # Step 3: Generate embeddings for all chunks
    chunk_embeddings = generate_embeddings(all_chunks, embedder)

    # Step 4: Generate query embedding
    query_embedding = generate_embeddings([query], embedder)

    # Step 5: Perform semantic search to find the most relevant chunk
    relevant_chunk_index = semantic_search(query_embedding, chunk_embeddings)

    # Retrieve the most relevant chunk
    relevant_chunk = all_chunks[relevant_chunk_index]

    # Step 6: Generate a response using Falcon 7B with the relevant chunk as context
    answer = generate_response_with_falcon(query, relevant_chunk, tokenizer_falcon, model_falcon)
    return answer
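# Reuse note (a minimal sketch, assuming you ask several questions of the same PDF):
# answer_query rebuilds the FAISS index inside semantic_search on every call. The helpers
# below, whose names are hypothetical, show how the index could be built once and then
# searched repeatedly with only the query embedding changing.
def build_chunk_index(chunk_embeddings):
    chunks_np = chunk_embeddings.cpu().detach().numpy()
    index = faiss.IndexFlatL2(chunks_np.shape[1])
    index.add(chunks_np)
    return index

def search_chunk_index(index, query_embedding, k=1):
    query_np = query_embedding.cpu().detach().numpy()
    _, I = index.search(query_np, k=k)
    return I[0]  # Indices of the k most relevant chunks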
# Example usage
if __name__ == "__main__":
    # Input PDF path and query
    pdf_path = "cj7v2ch1.pdf"  # Change this to your PDF file path
    query = "What is multithreading?"  # Your query

    # Get the answer from the PDF content
    answer = answer_query(pdf_path, query)
    print("Answer:", answer)