"""run_instructions.py

Evaluate a model on a Mastermind instruction-prompt dataset and write the
per-example results and run metadata to disk.
"""
import json
from argparse import ArgumentParser
from tqdm import tqdm
from transformers import AutoTokenizer
from datasets import load_dataset
from mastermind.models import AnthropicModel, HFModel, OpenAIModel
from mastermind.utils import parse_guess, make_output_path
def evaluate(model, dataset, args):
    """Run ``model`` on every datapoint and write the scored guesses to disk.

    For each datapoint the model is called with ``dp["instruction"]``, the
    last element of its output is handed to ``parse_guess``, and the parsed
    guess is compared with ``dp["secret_code"]``. One JSON record per
    datapoint is written to ``results.jsonl`` and the run metadata to
    ``info.json``, both inside the directory returned by
    ``make_output_path(game_type="instructions")``.

    Args:
        model: Callable invoked with a datapoint's ``instruction``; must also
            provide ``get_model_info()``.
        dataset: Iterable of mappings with ``"instruction"`` and
            ``"secret_code"`` keys.
        args: Object exposing a ``dataset`` attribute, recorded in
            ``info.json``.
    """
    log = []
    for dp in tqdm(dataset, desc="Evaluating."):
        # Reset per datapoint: if the model call itself fails, the handler
        # below must not see an unbound name (first iteration) or the output
        # left over from the previous datapoint.
        out = None
        try:
            out = model(dp["instruction"])
            guess = parse_guess(out[-1])
            record = {
                "guess": guess,
                "secret_code": dp["secret_code"],
                "correct": guess == dp["secret_code"],
                "valid": len(guess) == len(dp["secret_code"]),
            }
        except Exception:
            # Best-effort evaluation: any failure (model error, unparsable
            # output, guess without a length) counts as an invalid, incorrect
            # guess instead of aborting the whole run. Keep the raw last
            # output when there is one so the failure can be inspected.
            raw_guess = out[-1] if out else None
            record = {
                "guess": raw_guess,
                "secret_code": dp["secret_code"],
                "correct": False,
                "valid": False,
            }
        log.append(record)

    output_path = make_output_path(game_type="instructions")
    with open(output_path / "results.jsonl", "w", encoding="utf-8") as f:
        for item in log:
            f.write(json.dumps(item) + "\n")
    with open(output_path / "info.json", "w", encoding="utf-8") as f:
        json.dump({"model": model.get_model_info(), "dataset": args.dataset}, f)
if __name__ == "__main__":
    # Command-line entry point: pick a model backend and a dataset, then
    # evaluate the model on a fixed-seed sample of the dataset.
    parser = ArgumentParser()
    parser.add_argument("--model_type", type=str, default="hf")
    parser.add_argument("--model", type=str, default="Qwen/Qwen2.5-1.5B-Instruct")
    parser.add_argument("--dataset", default="flair/mastermind_24_prompt")
    arguments = parser.parse_args()

    dataset = load_dataset(arguments.dataset, split="train")
    # Sample at most 100 rows; clamping to the dataset size keeps select()
    # from failing on datasets with fewer than 100 examples.
    sample_size = min(100, len(dataset))
    dataset = dataset.shuffle(seed=42).select(range(sample_size))

    def apply_chat_template(example):
        """Wrap the raw instruction string as a single user chat message."""
        example['instruction'] = [{"role": "user", "content": example['instruction']}]
        return example

    dataset = dataset.map(apply_chat_template)

    if arguments.model_type == "hf":
        model = HFModel(model_name=arguments.model)
        # NOTE(review): this tokenizer is not used anywhere in this script;
        # kept in case loading it is relied on elsewhere - confirm and remove.
        tokenizer = AutoTokenizer.from_pretrained(arguments.model)
    elif arguments.model_type == "anthropic":
        model = AnthropicModel(model_name=arguments.model)
    elif arguments.model_type == "openai":
        model = OpenAIModel(model_name=arguments.model)
    else:
        raise ValueError(f"Invalid model type: {arguments.model_type}")

    # evaluate() takes (model, dataset, args) and reads args.dataset; the
    # previous call passed an unsupported num_runs keyword and omitted args,
    # which raised TypeError before any evaluation ran.
    evaluate(model, dataset, arguments)