-
Notifications
You must be signed in to change notification settings - Fork 1
/
prompt.py
340 lines (222 loc) · 14.4 KB
/
prompt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
import os
import openai
import re
import json
import numpy as np
from tqdm import tqdm
def fill_in_convert_to_statement_template(character, passage):
    """Build the user prompt that asks the model to turn a wiki passage
    about *character* into bullet-style persona statements."""
    header = f'Given the passage about "{character}":'
    request = f'Please generate some important persona statements about "{character}" for a role-playing AI to follow. Each statement should be formalized as a sentence that exactly contains "{character}" and avoids coreference.'
    return "\n".join([header, passage, request])
def convert_to_statement(character, model_engine):
    """Convert the character's wiki passages into persona statements.

    Reads wiki/wiki_{character}.txt, splits it into blank-line-separated
    passages, and asks the chat model (with a one-shot Beethoven example)
    to emit "- ..." bullet statements for each passage.

    Results are cached in statement/{character}.json; if that file already
    exists it is loaded and returned without calling the API.

    Args:
        character: Character name used in prompts and file paths.
        model_engine: OpenAI chat model name passed to ChatCompletion.create.

    Returns:
        List of dicts with keys "character", "passage", "statement".
    """
    cache_path = f"statement/{character}.json"
    if os.path.exists(cache_path):
        with open(cache_path) as f:
            return json.load(f)
    # One-shot in-context example teaching the model the "- ..." bullet format.
    icl_character = "Beethoven"
    icl_passage = '''Ludwig van Beethoven[n 1] (baptised 17 December 1770 – 26 March 1827) was a German composer and pianist. He is one of the most revered figures in the history of Western music; his works rank among the most performed of the classical music repertoire and span the transition from the Classical period to the Romantic era in classical music. Beethoven's career has conventionally been divided into early, middle, and late periods. His early period, during which he forged his craft, is typically considered to have lasted until 1802. From 1802 to around 1812, his middle period showed an individual development from the styles of Joseph Haydn and Wolfgang Amadeus Mozart, and is sometimes characterized as heroic. During this time, he began to grow increasingly deaf. In his late period, from 1812 to 1827, he extended his innovations in musical form and expression.'''
    icl_output = '''- Beethoven was a German composer and pianist born on 17 December 1770.
- Beethoven's works are highly celebrated in the Western music history, spanning the transition from the Classical period to the Romantic era.
- Beethoven's career is often segmented into early, middle, and late periods by music historians.
- The early period of Beethoven's career, up until 1802, involved him honing his musical talents.
- During the middle period, from 1802 to around 1812, Beethoven developed a distinct style that diverged from Joseph Haydn and Wolfgang Amadeus Mozart.
- This middle period of Beethoven's career is sometimes labeled as "heroic."
- Beethoven began to experience significant hearing loss during his middle period.
- Beethoven's late period, from 1812 until his death on 26 March 1827, featured further innovation in musical form and expression.'''
    icl = [
        {"role": "user", "content": fill_in_convert_to_statement_template(icl_character, icl_passage)},
        {"role": "system", "content": icl_output},
    ]
    with open(f"wiki/wiki_{character}.txt") as f:
        passages = f.read().split("\n\n")
    dataset = []
    bar = tqdm(passages)
    for passage in bar:
        try:
            statements = openai.ChatCompletion.create(
                model=model_engine,
                temperature=0.0,
                messages=icl + [
                    {"role": "user", "content": fill_in_convert_to_statement_template(character, passage)},
                ],
            ).choices[0]['message']["content"]
            # Keep only bullet lines; strip the leading "- " marker.
            for statement in statements.split("\n"):
                if statement.startswith("- "):
                    dataset.append({"character": character, "passage": passage, "statement": statement[2:]})
            bar.set_description(f"Converting the Document to Persona Statements... Number of Statements: {len(dataset)}")
        except Exception:
            # Best-effort: skip passages whose API call or parsing fails.
            pass
    # Context manager guarantees the cache file is flushed and closed.
    with open(cache_path, "w") as f:
        json.dump(dataset, f)
    return dataset
def fill_in_relevant_query_generation_template(character, statement):
    """Build the prompt asking for 3 user utterances that the given
    persona statement should be used to answer."""
    parts = [
        f'Persona Statement: {statement}',
        f'What utterance from the human user to an AI character role-playing as {character} has to be responded by including the information in the persona statement above?',
        f'Provide 3 diverse and concise possible utterances which view the AI as {character} and do not include the name inside the utterance.',
    ]
    return "\n".join(parts)
def build_relevant_query_dataset(character, persona_statement_dataset, model_engine):
    """Generate user queries that each persona statement should answer.

    For every statement, asks the chat model for 3 candidate user utterances
    and extracts the double-quoted strings from the reply. Results are cached
    in statement/{character}.query_relevant_to_statement.json.

    Args:
        character: Character name inserted into the prompt.
        persona_statement_dataset: Output of convert_to_statement
            (dicts carrying a "statement" key).
        model_engine: OpenAI chat model name.

    Returns:
        List of dicts: {"statement": ..., "queries": [...]}.
    """
    cache_path = f"statement/{character}.query_relevant_to_statement.json"
    if os.path.exists(cache_path):
        with open(cache_path) as f:
            return json.load(f)
    system_prompt = "You are helpful agent to build AI characters, your job is to generate possible user utterances to AI characters."
    dataset = []
    bar = tqdm(persona_statement_dataset)
    n_query = 0
    for data in bar:
        try:
            statement = data["statement"]
            response = openai.ChatCompletion.create(
                model=model_engine,
                temperature=1.0,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": fill_in_relevant_query_generation_template(character, statement)},
                ],
            ).choices[0]['message']["content"]
            # Non-greedy match so two quoted utterances on one line are
            # captured separately instead of being merged into one match
            # (the previous greedy pattern swallowed everything between the
            # first and last quote on a line).
            queries = re.findall(r"\"(.*?)\"", response)
            dataset.append({"statement": statement, "queries": queries})
            n_query += len(queries)
            bar.set_description(f"Generating Relevant Queries... Number of Queries: {n_query}")
        except Exception:
            # Best-effort: skip statements whose API call fails.
            pass
    with open(cache_path, "w") as f:
        json.dump(dataset, f)
    return dataset
def fill_in_query_discrimination_template(character, statement, query):
    """Build the yes/no prompt asking whether *query* must be answered
    using the information in *statement*."""
    fields = [
        f'Character: {character}',
        f'Persona Statement: {statement}',
        f'User Utterance: {query}',
        'Does this user utterance should be responded by including the information in the given persona statement? Only answer "yes" or "no" without any explanation.',
    ]
    return "\n".join(fields)
def build_statement_query_relevance_dataset(character, relevant_query_dataset, model_engine):
    """Label (statement, query) pairs as relevant ("yes") or not ("no").

    Queries generated for a statement are labeled "yes" by construction;
    queries sampled from up to 5 other statements are judged by the chat
    model. Results are cached in statement/{character}.relevance.json.

    Args:
        character: Character name inserted into the prompts and records.
        relevant_query_dataset: Output of build_relevant_query_dataset.
        model_engine: OpenAI chat model name.

    Returns:
        List of dicts: {"character", "statement", "query", "relevant"}.
    """
    cache_path = f"statement/{character}.relevance.json"
    if os.path.exists(cache_path):
        with open(cache_path) as f:
            return json.load(f)
    system_prompt = "You are a helpful agent to build AI characters, your job is to determine whether an utterance from the human user to a role-playing AI should be responded by including the information in the given persona statement or not."
    dataset = relevant_query_dataset
    new_dataset = []
    bar = tqdm(dataset)
    for data in bar:
        statement = data["statement"]
        others = [_data for _data in dataset if _data != data]
        # This statement's own queries are positives by construction.
        for query in data["queries"]:
            new_dataset.append({"character": character, "statement": statement, "query": query, "relevant": "yes"})
        # Sample queries from other statements and let the model judge them.
        for _data in np.random.choice(others, min(len(others), 5), replace=False):
            try:
                query = np.random.choice(_data["queries"])
                prompt = fill_in_query_discrimination_template(character, statement, query)
                relevant = openai.ChatCompletion.create(
                    model=model_engine,
                    temperature=0.0,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": prompt},
                    ],
                ).choices[0]['message']["content"].lower()
                # Keep only clean single-word answers; anything else is dropped.
                if relevant in ["no", "yes"]:
                    new_dataset.append({"character": character, "statement": statement, "query": query, "relevant": relevant})
            except Exception:
                # Best-effort: skip pairs whose API call fails (e.g. a
                # sampled record with an empty "queries" list).
                pass
        bar.set_description(f"Discriminating Queries... Number of Queries: {len(new_dataset)}")
    with open(cache_path, "w") as f:
        json.dump(new_dataset, f)
    return new_dataset
def fill_in_nli_generation_template(character, statement, query):
    """Build the prompt asking the model to write one entailed, one neutral,
    and one contradicted response to *query*, formalized as a dict."""
    # Note: the dictionary-format instruction is kept as a plain (non-f)
    # string because it contains literal braces.
    instructions = [
        f'Character: {character}',
        f'Persona Statement: {statement}',
        f'User Utterance: {query}',
        '1. What are some common attributes among the responses to the user utterance no matter whether they are correct or incorrect according to the persona statement? Use these attributes to write the following responses.',
        '2. Write a possible response to this utterance that the given persona statement is entailed to it in natural language inference, indicating the response correctly follows the information in the persona statement.',
        '3. Write a possible response to this utterance that the given persona statement is neutral to it in natural language inference, indicating the response might be correct but lacks the information in the persona statement.',
        '4. Write a possible response to this utterance that the given persona statement is contradicted to it in natural language inference, indicating the response might be partially correct but contains partial hallucination according to the persona statement.',
        '5. Formalize the reponses as a Python Dictionary: {"entailed": "...", "neutral": "...", "contradicted": "..."}',
    ]
    return "\n".join(instructions)
def build_statement_to_response_nli_dataset(character, relevant_query_dataset, model_engine):
    """Generate entailed/neutral/contradicted responses per (statement, query).

    Asks the chat model for three responses per query and parses the first
    {...} JSON object out of the reply. Results are cached in
    statement/{character}.nli.json.

    Args:
        character: Character name inserted into the prompts and records.
        relevant_query_dataset: Output of build_relevant_query_dataset.
        model_engine: OpenAI chat model name.

    Returns:
        List of dicts: {"character", "statement", "query", "nli"} where
        "nli" maps labels ("entailed"/"neutral"/"contradicted") to responses.
    """
    cache_path = f"statement/{character}.nli.json"
    if os.path.exists(cache_path):
        with open(cache_path) as f:
            return json.load(f)
    system_prompt = "You are a helpful agent to build AI characters, your job is show possible responses that the given persona statement is entailed, neutral, contradicted to them in natural language inference."
    new_dataset = []
    n_nli = 0
    dataset = relevant_query_dataset
    bar = tqdm(dataset)
    for data in bar:
        statement = data["statement"]
        # (Removed a per-iteration list comprehension over the whole dataset
        # that was computed but never used — pure O(n^2) dead work.)
        for query in data["queries"]:
            try:
                prompt = fill_in_nli_generation_template(character, statement, query)
                response = openai.ChatCompletion.create(
                    model=model_engine,
                    temperature=1.0,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": prompt},
                    ],
                ).choices[0]['message']["content"]
                # Strip newlines so the non-greedy {.*?} can span the whole
                # dictionary; take the first JSON object in the reply.
                nli = json.loads(re.findall(r"({.*?})", response.replace("\n", ""))[0])
                # Keep only the three expected labels, case-insensitively.
                nli = {key.lower(): nli[key] for key in nli if key.lower() in ["entailed", "neutral", "contradicted"]}
                new_data = {"character": character, "statement": statement, "query": query, "nli": nli}
                n_nli += len(nli)
                new_dataset.append(new_data)
                bar.set_description(f"Generating NLI Data... Number of NLI Data: {n_nli}")
            except Exception:
                # Best-effort: skip queries whose API call or JSON parse fails.
                pass
    with open(cache_path, "w") as f:
        json.dump(new_dataset, f)
    return new_dataset
def fill_in_brief_nli_discrimination_template(character, statement, query, response):
    """Build the short NLI-discrimination prompt (no label explanations):
    ask whether *statement* is entailed/neutral/contradict w.r.t. *response*."""
    rows = [
        f'Character: {character}',
        f'Persona Statement: {statement}',
        f'User Utterance: {query}',
        f'Response: {response}',
        'For this response, is the given persona entailed, neutral, or contradict to it in natural language inference? Only answer "entailed", "neutral" or "contradict" without any explanation.',
    ]
    return "\n".join(rows)
def fill_in_nli_discrimination_template(character, statement, query, response):
    """Build the full NLI-discrimination prompt, prefixed with an explanation
    of the three labels, asking the model to classify *response* against
    *statement* as entailed/neutral/contradicted."""
    rows = [
        'Explanation:',
        'entailed: the response correctly follows the information in the persona statement,',
        'neutral: the response might be correct but lacks the information in the persona statement,',
        'contradict: the response might be partially correct but contains partial hallucination according to the persona statement,',
        '---',
        f'Character: {character}',
        f'Persona Statement: {statement}',
        f'User Utterance: {query}',
        f'Response: {response}',
        'For this response, is the given persona entailed, neutral, or contradict to it in natural language inference? Only answer "entailed", "neutral" or "contradicted" without any explanation.',
    ]
    return "\n".join(rows)
def discriminate_statement_to_response_nli_dataset(character, statement_to_response_nli_dataset, model_engine):
    """Re-label generated NLI responses with the chat model as a discriminator.

    Each generated response is re-judged against its own statement; in
    addition, up to 3 responses sampled from other records are judged against
    this record's statement. Results are cached in
    statement/{character}.nli.v2.json.

    Args:
        character: Character name (rebound per record, see below).
        statement_to_response_nli_dataset: Output of
            build_statement_to_response_nli_dataset.
        model_engine: OpenAI chat model name.

    Returns:
        List of dicts: {"character", "statement", "query", "response", "label"}.
    """
    cache_path = f"statement/{character}.nli.v2.json"
    if os.path.exists(cache_path):
        with open(cache_path) as f:
            return json.load(f)
    system_prompt = "You are a helpful agent to build AI characters, your job is to discriminate whether the given persona statement is entailed, neutral, contradict to the response in natural language inference."
    dataset = statement_to_response_nli_dataset
    new_dataset = []
    bar = tqdm(dataset)
    for data in bar:
        # NOTE(review): this rebinds the `character` parameter to the
        # per-record value, mirroring the original code; presumably the two
        # are always identical — confirm against the callers.
        character, statement, query = data["character"], data["statement"], data["query"]
        # Re-judge this record's own responses.
        for label in data["nli"]:
            try:
                response = data["nli"][label.lower()]
                prompt = fill_in_nli_discrimination_template(character, statement, query, response)
                new_label = openai.ChatCompletion.create(
                    model=model_engine,
                    temperature=0.0,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": prompt},
                    ],
                ).choices[0]['message']["content"].lower()
                new_dataset.append({"character": character, "statement": statement, "query": query, "response": response, "label": new_label})
            except Exception:
                # Best-effort: skip responses whose API call fails.
                pass
        # Judge randomly sampled responses from other records against this
        # statement (these mostly act as non-entailed examples).
        others = np.random.choice([_data for _data in dataset if _data != data], min(len(dataset) - 1, 3), replace=False)
        for _data in others:
            try:
                _query = _data["query"]
                _response = _data["nli"][np.random.choice(["contradicted", "neutral", "entailed"])]
                prompt = fill_in_nli_discrimination_template(character, statement, _query, _response)
                new_label = openai.ChatCompletion.create(
                    model=model_engine,
                    temperature=0.0,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": prompt},
                    ],
                ).choices[0]['message']["content"].lower()
                new_dataset.append({"character": character, "statement": statement, "query": _query, "response": _response, "label": new_label})
            except Exception:
                # Best-effort: the sampled label may be missing from the
                # record's "nli" dict, or the API call may fail — skip.
                pass
        bar.set_description(f"Generating NLI V2 Data... Number of NLI Data: {len(new_dataset)}")
    with open(cache_path, "w") as f:
        json.dump(new_dataset, f)
    return new_dataset