f2.py
import pdfplumber
import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import CrossEncoder
import torch

# Ensure the punkt sentence tokenizer is downloaded before first use.
# NLTK >= 3.9 ships it as 'punkt_tab'; downloading both covers old and new versions.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
# Step 1: Extract text from PDF using pdfplumber
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() returns None for pages without a text layer
            text += (page.extract_text() or "") + "\n"
    return text
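
# A quick usage sketch (the file name below is a placeholder, not part of this repo):
#   text = extract_text_from_pdf("example.pdf")
#   print(text[:200])  # preview the first 200 extracted characters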
# Step 2: Chunk the text into smaller parts for efficient querying
def chunk_text(text, chunk_size=500):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        # Start a new chunk once adding the next sentence would exceed chunk_size;
        # the current_chunk guard avoids emitting an empty first chunk
        if len(current_chunk) + len(sentence) > chunk_size and current_chunk:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            current_chunk += " " + sentence
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
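
# A minimal sketch of how the chunker behaves, using a tiny chunk_size for
# illustration (real chunks would be ~500 characters as above):
#   chunk_text("One. Two. Three.", chunk_size=10)
#   -> ["One. Two.", "Three."]  # a new chunk starts at the sentence that would
#                               # push the current chunk past chunk_size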
# Step 3: Initialize Falcon 7B Instruct model for answer generation
def load_falcon_model():
    model_name = "tiiuae/falcon-7b-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,  # half precision roughly halves memory vs. fp32
        device_map="auto",
    )
    return tokenizer, model
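
# Loading falcon-7b-instruct in bfloat16 needs roughly 14 GB of GPU memory. A
# hedged alternative for smaller GPUs is 4-bit quantization via the optional
# bitsandbytes package (a sketch, not tested here):
#   from transformers import BitsAndBytesConfig
#   model = AutoModelForCausalLM.from_pretrained(
#       model_name,
#       device_map="auto",
#       quantization_config=BitsAndBytesConfig(load_in_4bit=True),
#   )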
# Step 4: Generate a response using Falcon 7B with a fact-based prompt
def generate_response_with_facts(query, relevant_chunk, tokenizer, model):
    prompt = f"""You are an intelligent assistant tasked with answering factual questions based on a document.
The document provided to you contains relevant information, but you should only provide an answer
if the information is directly available in the document. If the document doesn't contain enough information
to answer the question, respond with 'I don't have enough information to answer that question.'

Question: {query}
Context: {relevant_chunk}

Your answer should be based strictly on the information provided in the context. Do not make any assumptions,
do not hallucinate any information, and only give factual, verifiable answers."""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=256,  # cap the answer length, not prompt + answer
            pad_token_id=tokenizer.eos_token_id,
        )
    # Decode only the newly generated tokens so the prompt is not echoed back;
    # this covers both real answers and the "I don't have enough information" fallback
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:],
                                skip_special_tokens=True)
    return response.strip()
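
# generate() above decodes greedily, which suits fact-constrained answers. If
# more varied phrasing is wanted, sampling can be enabled with standard
# transformers kwargs (the values here are assumptions, tune to taste):
#   outputs = model.generate(inputs.input_ids, max_new_tokens=256,
#                            do_sample=True, temperature=0.7, top_p=0.9,
#                            pad_token_id=tokenizer.eos_token_id)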
# Step 5: Initialize CrossEncoder for ranking relevant chunks.
# Note: 'msmarco-distilbert-base-v3' is a bi-encoder checkpoint; loading it into
# CrossEncoder attaches a randomly initialized scoring head, so a purpose-built
# reranker such as 'cross-encoder/ms-marco-MiniLM-L-6-v2' is used instead.
def load_cross_encoder():
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    return cross_encoder

# Step 6: Get the most relevant chunk using CrossEncoder
def get_most_relevant_chunk(query, chunks, cross_encoder):
    # Score all (query, chunk) pairs in one batched call
    scores = cross_encoder.predict([(query, chunk) for chunk in chunks])
    most_relevant_idx = int(scores.argmax())  # index of the highest-scoring chunk
    return chunks[most_relevant_idx]
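
# Sketch: instead of the single argmax chunk, the top-k chunks could be joined
# to give the model more context (k=3 and newline-joining are assumptions):
#   import numpy as np
#   top_k = np.argsort(scores)[-3:][::-1]
#   context = "\n\n".join(chunks[i] for i in top_k)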
# Main function to process the query
def answer_query(pdf_path, query, chunk_size=500):
    # Load the models
    tokenizer, model = load_falcon_model()
    cross_encoder = load_cross_encoder()

    # Step 1: Extract text from the PDF
    pdf_text = extract_text_from_pdf(pdf_path)

    # Step 2: Chunk the text into smaller parts
    chunks = chunk_text(pdf_text, chunk_size)

    # Step 3: Find the most relevant chunk using the CrossEncoder
    relevant_chunk = get_most_relevant_chunk(query, chunks, cross_encoder)

    # Step 4: Generate an answer using Falcon 7B with the relevant chunk as context
    response = generate_response_with_facts(query, relevant_chunk, tokenizer, model)
    return response
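
# For repeated questions against the same PDF, loading the 7B model once and
# reusing it across calls avoids the dominant startup cost (a sketch; it would
# need a variant of answer_query that accepts already-loaded models):
#   tokenizer, model = load_falcon_model()
#   cross_encoder = load_cross_encoder()
#   for q in ["What is multithreading?", "What is a race condition?"]:
#       ...  # rerank + generate with the already-loaded models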
if __name__ == "__main__":
    # Input PDF path and query
    pdf_path = "cj7v2ch1.pdf"  # Change this to your PDF file path
    query = "What is multithreading?"  # Your query

    # Get the answer from the PDF content
    answer = answer_query(pdf_path, query)
    print("Answer:", answer)