# openai_generate.py (from a fork of EleutherAI/lm-evaluation-harness)
import argparse
import os
import time
import openai
import json
from tqdm import tqdm
from datasets import load_dataset
# The OpenAI API key must be supplied through the OPENAI_API_KEY environment variable
openai.api_key = os.getenv('OPENAI_API_KEY')
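# For example (assuming a valid key is exported in the shell):
#   OPENAI_API_KEY=<your key> python openai_generate.py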
# As instructed here: https://community.openai.com/t/token-logprobs-when-echo-is-true/9626/2
# "Transformer models don’t predict the probability of the first token. If you want to get the probability
# for your first token you can try to use <|endoftext|> as the first token as a workaround."
endoftext_token = '<|endoftext|>'
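# Illustrative only: a scoring call that uses this workaround might prepend the token to the
# prompt and request logprobs without generating new tokens, e.g.
#   oa_completion(engine=..., prompt=endoftext_token + code_str, max_tokens=0, logprobs=0, echo=True)
# This generation script itself never prepends endoftext_token.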
def oa_completion(**kwargs):
    """Query the OpenAI API for a completion.
    Retry with back-off until the API responds.
    """
    backoff_time = 3
    while True:
        try:
            return openai.Completion.create(**kwargs)
        except openai.error.OpenAIError:
            import traceback
            traceback.print_exc()
            time.sleep(backoff_time)
            backoff_time *= 1.5
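# Note: the retry loop above never gives up; the wait between attempts grows
# geometrically (3s, 4.5s, 6.75s, ...) until a request succeeds.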
def call_codex(code_str, temperature):
    # engine: "code-davinci-001" is the Codex model queried here
    # max_tokens=256 lets the model generate up to 256 new tokens per completion
    # n=100 requests 100 sampled completions per prompt
    # echo=True makes the API return our prompt in front of each completion
    completion = oa_completion(engine="code-davinci-001", prompt=code_str,
                               max_tokens=256,
                               temperature=temperature,
                               top_p=0.95,
                               n=100,
                               echo=True)
    completed_code = [c['text'] for c in completion.choices]
    return completed_code
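# Because echo=True, every string in the returned list starts with the original prompt,
# followed by the sampled completion; downstream scoring code must account for the echoed prompt.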
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # --output is parsed but not used below
    parser.add_argument('--output', type=str, required=False, default=os.devnull)
    args = parser.parse_args()

    # Load the HumanEval evaluation dataset
    human_eval = load_dataset("openai_humaneval")

    # Collect the prompts and canonical solutions from the test split
    n_tasks = len(human_eval["test"])
    prompts = []
    solutions = []
    for task in range(n_tasks):
        prompts.append(human_eval["test"][task]["prompt"])
        solutions.append(human_eval["test"][task]["canonical_solution"])
    print('Loaded HumanEval:', len(prompts))

    # Generate completions for every prompt at each sampling temperature;
    # each output line is the JSON-encoded list of completions for one task
    for temp in [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]:
        with open('code-davinci-001.' + str(temp) + '.jsonl', 'w') as out_file:
            for idx, (prompt, solution) in tqdm(enumerate(zip(prompts, solutions))):
                completed_code = call_codex(prompt, temp)
                out_file.write(json.dumps(completed_code) + '\n')
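# Each output file (e.g. code-davinci-001.0.8.jsonl) can be read back with something like:
#   completions = [json.loads(line) for line in open('code-davinci-001.0.8.jsonl')]
# where each entry is the list of 100 prompt+completion strings for one HumanEval task.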