ResearchGPT.py
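"""
ResearchGPT: an automated research assistant.

Given a research query, this script (1) asks GPT-4 to plan a set of Google
searches, (2) retrieves candidate links through the Google Custom Search API,
(3) scrapes each page (HTML or PDF) and summarizes it with GPT-4 until the
model judges the search goal answered, then (4) compiles the summaries into a
footnoted markdown report, converts it to a Word document, and opens it.

The OPENAI_API_KEY, GOOGLE_API_DEVELOPER_KEY and GOOGLE_SEARCH_API_ID strings
below are placeholders and must be replaced with real credentials.
"""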
import re
import os
import requests
import json
import openai
from bs4 import BeautifulSoup
import time
import random
from googleapiclient.discovery import build
import io
from PyPDF2 import PdfReader
import markdown
from docx import Document
from docx.shared import Pt
from docx.oxml.ns import nsdecls
from docx.oxml import parse_xml
from html2text import html2text
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
openai.api_key = "OPENAI_API_KEY"  # placeholder: replace with a real OpenAI API key
def search_web(query, num_results=9):
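    # Query the Google Custom Search JSON API for `num_results` links matching
    # the query. If the API call fails (bad key, quota, no results), fall back
    # to asking the user to paste links manually until they type DONE.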
global skip
search_results = []
service = build("customsearch", "v1", developerKey="GOOGLE_API_DEVELOPER_KEY")
res = service.cse().list(q=query, cx='GOOGLE_SEARCH_API_ID', num=num_results).execute()
try:
search_results = [item['link'] for item in res['items']]
skip = False
except:
skip = True
search_results = []
while True:
newlink = input("Search failed, enter links one by one manually and write DONE when done: ")
if newlink == "DONE":
break
else:
search_results.append(newlink)
return search_results
def markdown_to_word(markdown_string, file_name):
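    # Convert the markdown report to a .docx file: "#", "##" and "###" headings
    # become styled runs, [^n^] markers become superscript footnote numbers,
    # and everything else is written as plain paragraphs.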
# Split the markdown text into paragraphs
paragraphs = markdown_string.split('\n')
# Create a new Word document
doc = Document()
# Iterate through each paragraph in the markdown text
for para in paragraphs:
if para.startswith("# "):
# This is a heading 1
paragraph = doc.add_paragraph()
run = paragraph.add_run(para[2:])
run.bold = True
run.font.size = Pt(12)
paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
elif para.startswith("## "):
# This is a heading 2
paragraph = doc.add_paragraph()
run = paragraph.add_run(para[3:])
run.bold = True
run.font.size = Pt(11)
elif para.startswith("### "):
            # This is a heading 3
paragraph = doc.add_paragraph()
run = paragraph.add_run(para[4:])
run.underline = True
run.font.size = Pt(11)
else:
# This is a normal paragraph
paragraph = doc.add_paragraph()
# Use a regular expression to find and replace all instances of [^x^]
superscript_pattern = re.compile(r'\[\^(\d+)\^\]')
start = 0
for match in superscript_pattern.finditer(para):
# Add the text before the superscript as normal text
run = paragraph.add_run(para[start:match.start()])
# Add the superscript text
run = paragraph.add_run(match.group(1))
run.font.superscript = True
start = match.end()
# Add any remaining text after the last superscript
run = paragraph.add_run(para[start:])
# Save the document
doc.save(file_name)
    # Open the document with its default application (Windows-only 'start' command)
os.system(f'start {file_name}')
def extract_text_from_link(link: str):
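    # Fetch the link and pull out readable text: .htm pages are reduced to
    # their <b> tags, .pdf files are parsed with PyPDF2, and everything else
    # is reduced to its <p> tags. The sleep(1) calls are a crude rate limit.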
    if link.endswith('.htm'):
page = requests.get(link)
time.sleep(1)
soup = BeautifulSoup(page.content, 'html.parser')
all_b_tags = soup.find_all('b')
return '\n'.join([b.get_text() for b in all_b_tags])
elif link.endswith('.pdf'):
response = requests.get(link)
time.sleep(1)
pdf_file = io.BytesIO(response.content)
pdf_reader = PdfReader(pdf_file)
text = ''
for page in range(len(pdf_reader.pages)):
text += pdf_reader.pages[page].extract_text()
return text
else:
page = requests.get(link)
time.sleep(1)
soup = BeautifulSoup(page.content, 'html.parser')
all_p_tags = soup.find_all('p')
return '\n'.join([p.get_text() for p in all_p_tags])
def passage_segmenter(passage):
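    # Split the scraped text into consecutive chunks of up to 11,900
    # characters so that each chunk fits within the model's context window.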
segment = []
count = 0
while count < len(passage):
segment.append(passage[count:count + 11900])
        count += 11900
return segment
def ask_question(messages):
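    # Stream a gpt-4-0613 chat completion, echoing tokens to stdout as they
    # arrive, and return the full concatenated reply.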
response = openai.ChatCompletion.create(
model="gpt-4-0613",
messages=messages,
stream=True
)
output = ""
print("thinking...")
for chunk in response:
if "delta" in chunk["choices"][0]:
delta = chunk["choices"][0]["delta"]
if "content" in delta:
content = delta["content"]
output += content
print(content, end="")
return output
def order_links(query, links_str):
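    # Ask GPT-4 to rank the candidate links by how likely each is to answer
    # the query, returning the ranking as a list of 1-based indices.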
response = openai.ChatCompletion.create(
model="gpt-4",
messages=[
{"role": "system", "content": "You are a helpful, pattern-following assistant."},
{"role": "user", "content": f"Output in the format of a python list the order in which of following links are most likely to best answer the query, {query}\nExample output formatting (order is random. Use as a formatting example only. You are only allowed to re-oder the list, you are not allowed to remove links): [4,7,2,1,5,6,8,9,3]\n (answer with list of ints only)? Links: " + links_str}
]
)
output_str = response["choices"][0]["message"]["content"].lower().replace(" ", "")
# Process the output string and convert it into a list of integers
output_list = [int(x) for x in output_str.strip('[]').split(',') if x.strip().isdigit()]
return output_list
def name_file(output):
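    # Ask GPT-4 to propose a Windows-safe file name for the finished report.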
response = openai.ChatCompletion.create(
model="gpt-4",
messages=[
{"role": "system", "content": """You are WordDocumentNameGeneratorGPT. You are given a research report and you come up with a name for it based on what it is about and based on the following word file naming conventions: Avoid backslashes in actual file or directory names as they serve as separators. Volume names may require backslashes, like in "C: or "server\share" in UNC names. Don't count on case sensitivity; treat OSCAR, Oscar, oscar as identical, although POSIX systems may differ. NTFS has POSIX case sensitivity support, but it's non-default. Unicode, and characters from the extent 128-255 are allowed barring reserved characters (like <, >, :, ", /, |, ?, *, and ASCII NUL) and integer values 1-31. Reserved file names include CON, PRN, AUX, NUL, COM(0-9), and LPT(0-9) including any extensions (e.g., NUL.txt). Also, avoid ending names with a space or a period. A period can represent the current directory (.\temp.txt) or the preceding directory (..\temp.txt). Note: Your entire output will be treated as the input for the file name so do not output anything you do not intend to be placed directly into the file name."""},
{"role": "user", "content": f"Come up with a file name for the following research report (output file name only): {output}"}
]
)
output_str = response["choices"][0]["message"]["content"]
return output_str
def summarize(query, res, link, add_note):
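    # Summarize one text segment with respect to the research query. If a
    # critique of the previous segment's summary is passed in add_note, it is
    # prepended to steer this summary. The model must reply with a JSON object
    # containing "is_relevant" and "summary".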
if add_note != "":
note = "The following is a critique of the summary you wrote on a text snippet from the same article that preceeded the one you are receiving now. USe this to guide your summary: " + add_note +" Only summarize information from this text snippet if it is related to what is specified here. Otherwise set is_relevant to null"
print("\n\nNote: " + note + "\n\n")
res = note +"\nText: "+ res
else:
res = "Text: "+ res
response = openai.ChatCompletion.create(
model="gpt-4-0613",
messages=[
{"role": "system", "content": """You are a helpful, pattern-following assistant. You are given some text retrieved from a website and a research query and you generate a very detailed and comprehensive summary of only the parts of the text relevant and useful to the answering the research query. Include as much detail as is physically possible. You only answer using the following JSON format and strictly follow JSON formatting conventions:
{
"is_relevant" : boolean, #true if the provided text provides information relevant to answering the users query, false if the text irrelevant to the query or just discusses access denial to a webpage
"summary": "string" #very detailed (but not wordy) summary of the key information in the text if is_relevant is true (squeeze as much info into as few characters as you can. If you'd like, you can even use some shorthand). null if is_relevant is false
}
NOTE: It cannot be stressed enough how important it is that you do not break JSON formatting conventions. If you do, it will cause a JSONDecodeError. Avoid invalid control characters. Only output one JSON summary."""},
            {"role": "user", "content": f"Summarize, in significant detail, the key information in the following text (scraped from the website {link}) that is relevant to answering the question: {query}. " + res}
],
stream = True
)
output = ""
for chunk in response:
if "delta" in chunk["choices"][0]:
delta = chunk["choices"][0]["delta"]
if "content" in delta:
content = delta["content"]
if "}" in content:
output += content
print(content, end="")
break
else:
output += content
print(content, end="")
return output.strip()
def check_source(query_links, search_info, topic_summary):
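    # Ask the model whether the accumulated summary already answers the search
    # goal ("continue" false) and whether the current source should be
    # abandoned ("skip" true). Streams and returns the raw JSON reply.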
print("\n")
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo-16k",
messages=[
{"role": "system", "content": """You are a helpful, pattern-following assistant. You will be given a summary of the text from a website and a users question and you will decide whether the summary generated provides enough information to answer the question or if the information gathered is inadequate and more is needed. But you also make sure not boil the ocean or get stuck in some infinite task loop. Instead, you're aware that there is no such thing as the perfect answer and that your goal is not to optimize towards "all-encompassing" but "just enought". You only respond in the following JSON format:
{
"continue" : boolean, #false if the provided summary provides ample information to answer the users question, true if more information is needed.
"reason" : "string", # A one sentence description of exactly what information is missing that is needed to answer the question if continue is true. Also describe in detail what is needed to answer the question and what information is extra. null is continue is false
"skip": boolean # If it seems that the content being summarized is not a great source for answering the question, then set "skip" to true. If the text is relevant and more of it should be summarized, set to false
}
*Note: only set continue to true if key information is needed, otherwise set this value to false. If the question can be answered using the information gathered, then continue should be false. All that's needed is a few sentences that actually answer the question.
"""},
{"role": "user", "content": f"Does the following text provide ample information to answer the question, {search_info} Text: " + topic_summary +"\nLinks: " + str(query_links)}
],
temperature=0,
top_p=1,
frequency_penalty=0,
presence_penalty=0,
stream = True
)
output = ""
for chunk in response:
if "delta" in chunk["choices"][0]:
delta = chunk["choices"][0]["delta"]
if "content" in delta:
content = delta["content"]
output += content
print(content, end="")
return output
def start_research(query):
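    # Ask GPT-4 to plan the research: it returns JSON listing the individual
    # Google searches to run and the information goal behind each one.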
messages=[
{"role": "system", "content": """
You are an AI research assistant that only responds in JSON. You have been shown to be capable of completing
complex research tasks that require retrieving information on several different topics at a superhuman level.
The following is the JSON format of all your outputs
{
"number_of_searches" : int #An integer value representing the number individual google searches needed,
"search_queries" : {
"1" : "string", #first search query,
"2" : "string", #second search query
...
"n" : "string" #nth query (Max queries: 8, Min queries: 4)
}
"search_query_goals" : {
"1" : "string", #Describe what information that needs to be obtained with query one
"2" : "string", #Describe what information that needs to be obtained with query two
...
"n" : "string" #Describe what information that needs to be obtained with query n (Max queries: 5)
}
}
search_query_goal should always be phrased as a question
"""}
]
messages.append({"role": "user", "content": query})
# Create a dictionary to hold search queries
json_dict = {
"search_queries": {}
}
response = ask_question(messages)
json_data = json.loads(response)
num_searches = int(json_data['number_of_searches'])
return json_dict, json_data, num_searches
# Function to gather web information based on a search query
def get_web_info(json_dict, json_data, i, read_links):
# Create a new key for the dictionary based on the index of the topic
new_key = f"topic_{str(i)}"
# Initialize an empty list to store search queries for the new topic
json_dict["search_queries"][new_key] = []
# Get the search query and goal from the input data
search_query = json_data['search_queries'][str(i)]
search_info = json_data['search_query_goals'][str(i)]
# Print a status update
print("\nRetrieving links for topic " + str(i) + "...\n")
# Perform a web search based on the search query and store the result links
query_links = search_web(search_query)
# Remove any links that have already been read
for link in read_links:
if link in query_links:
query_links.remove(link)
# Print the retrieved links
print("\nLinks retrieved:")
for link_count, link in enumerate(query_links, 1):
print(f"Link #{link_count}: {link}")
# Order the links based on their relevancy to the search query
print("\nOrdering links by relevancy...")
ordered_links = order_links(search_info, str(query_links))
print(f"Order chosen: {ordered_links}")
# Return the search query, search goal, query links, ordered links, updated dictionary, and the new key
return search_query, search_info, query_links, ordered_links, json_dict, new_key
# Function to create summaries from the information gathered from the web
def create_summaries(search_query, search_info, query_links, ordered_links, json_dict, json_data, new_key, i):
# Initialize a dictionary to store whether to continue searching
link_verdict_json = {
"continue": True
}
# Initialize a link counter
link_count = 1
# Initialize a summary counter
summary_count = 0
# Initialize flags for whether all links have been processed and whether a new search has been generated
all_links_processed = False
new_search_generated = False
# Continue to process links as long as there are links left and the 'continue' flag is set
while link_verdict_json['continue'] and not all_links_processed and summary_count <= 5:
# For each ordered link...
for ordered_link in ordered_links:
# Generate a new key for the link
link_key = f"link_{str(link_count)}"
# Get the link and its text content
current_link = query_links[ordered_link - 1]
current_link_text = extract_text_from_link(current_link).strip()
print("\nSummarizing text...\n")
print("Link: " + str(current_link) + "\n")
# Segment the text content into manageable chunks
segments = passage_segmenter(current_link_text)
# For each segment, generate a summary and check if the source is relevant
reason = ""
for_verdict = ""
for segment in segments:
link_summary = summarize(search_info, segment, current_link, reason)
summary_json = json.loads(link_summary)
# If the source is relevant, add it to the dictionary and check if more information is needed
if summary_json['is_relevant']:
for_verdict += summary_json['summary']
read_links.append(current_link)
new_value_dict = {
f"{link_key}": current_link,
f"{link_key}_summary": summary_json['summary']
}
json_dict["search_queries"][new_key].append(new_value_dict)
link_verdict_json = json.loads(check_source(query_links, search_info, for_verdict))
# Increment the summary counter
summary_count += 1
# Break the loop if no more information is needed or if the length limit has been reached
if not link_verdict_json['continue'] or len(link_summary) >= 6000:
reason = ""
for_verdict = ""
break
elif link_verdict_json['skip'] is True:
break
else:
reason = link_verdict_json['reason']
else:
break
print("\n")
# Increment the link counter
link_count += 1
# Set the 'all_links_processed' flag if all links have been processed
print("\n\nSummary count: " + str(summary_count) + "\n\n")
if link_count > len(ordered_links):
all_links_processed = True
if summary_count >= 5:
break
# Break the loop if no more information is needed, if the link limit has been reached, or if all links have been processed
if not link_verdict_json['continue'] or link_count > 9 or all_links_processed or summary_count >= 5:
break
if not link_verdict_json['continue'] or link_count > 9 or all_links_processed or summary_count >= 5:
break
# Print the updated dictionary
storage = json.dumps(json_dict, indent=2)
print("\n")
print(storage)
# Return the updated dictionary and the 'new_search_generated' flag
return json_dict, new_search_generated
# Function to generate an answer from the gathered and summarized information
def generate_answer(query, json_dict):
# Initialize the list of messages for the answer
answer_messages=[
{"role": "system", "content": """
You are a research chatbot. You will be provided with a research task from the user as well as a collection of information that was just scraped from the web, and your job is to use that information to generate a very detailed and comprehensive research report with evidence-based explanations for every argument. Your reports should be comparable in length to professional industry research reports like those published by Nielsen or think tanks like the Brookings Institution. Your outputs should never be less than 3000 words in length but you should always aim for 5200 words. You will always cite your work by using footnotes and your output will always be in markdown syntax.
Follow this formatting:
# Main title
## Sub Titles
[^n^] (where n is a number) for footnotes
"""}
]
# Generate the prompt for the answer
answer_prompt = "User generated research question: " + query + "\n\nInformation:\n"
sorted_keys = sorted(json_dict["search_queries"].keys())
# For each topic, add its information to the prompt
for topic in sorted_keys:
answer_prompt += f"\n- Topic: {topic}\n"
for topic_info_dict in json_dict["search_queries"][topic]:
for link_key, link in topic_info_dict.items():
# Only add the link to the prompt if it does not contain a summary
if "_summary" not in link_key:
answer_prompt += f" - {link_key}: {link}\n"
summary_key = f"{link_key}_summary"
# If a summary exists for the link, add it to the prompt
if summary_key in topic_info_dict:
summary = topic_info_dict[summary_key]
answer_prompt += f" - {summary_key}: {summary}\n"
# Print the prompt and add it to the messages
print(answer_prompt)
answer_messages.append({"role": "user", "content": answer_prompt})
# Generate the answer based on the messages
answer = ask_question(answer_messages)
answer = answer.replace("\n\n", "\n")
# Return the answer
return answer
def main():
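    # Orchestrate the whole pipeline: plan the searches, gather and summarize
    # sources for each search topic, then write and open the final report.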
query = input("Query: ")
global read_links
read_links = []
json_dict, json_data, num_searches = start_research(query)
for i in range(1, num_searches + 1):
search_query, search_info, query_links, ordered_links, json_dict, new_key = get_web_info(json_dict, json_data, i, read_links)
json_dict, new_search_generated = create_summaries(search_query, search_info, query_links, ordered_links, json_dict, json_data, new_key, i)
output = generate_answer(query, json_dict)
    file_name = name_file(output).replace(" ", "_")
    if not file_name.lower().endswith(".docx"):
        file_name += ".docx"  # give the report an explicit extension so Word opens it
markdown_to_word(output, file_name)
if __name__ == "__main__":
    main()