-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfinetune.py
executable file
·122 lines (96 loc) · 3.36 KB
/
finetune.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python3
import argparse
import string
from datasets import load_dataset
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
DataCollatorForLanguageModeling,
GenerationConfig,
Trainer,
TrainerCallback,
TrainingArguments,
)
def make_parser():
    """Build the command-line parser for this fine-tuning script.

    Flags:
        --fast: enable the CUDA code paths and larger batch sizes.
        --val_set_size: number of validation examples to keep (default 10).
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--fast", action="store_true")
    arg_parser.add_argument("--val_set_size", type=int, default=10)
    return arg_parser
class MyCallback(TrainerCallback):
    """Trainer callback that prints a sample generation at each evaluation."""

    def __init__(self, fast: bool):
        # Mirrors the script's --fast flag: when True, the model is on CUDA.
        self.fast = fast

    def on_evaluate(self, args, state, control, **kwargs):
        """Sample a continuation of a fixed prompt and print it."""
        model = kwargs["model"]
        tokenizer = kwargs["tokenizer"]
        prompt_ids = tokenizer.encode(
            "Once upon a time there was a child named", return_tensors="pt"
        )
        if self.fast:
            # The model lives on the GPU in fast mode; inputs must follow it.
            prompt_ids = prompt_ids.cuda()
        sampling_config = GenerationConfig(do_sample=True, temperature=1.0)
        generated = model.generate(
            prompt_ids,
            max_length=100,
            num_beams=1,
            generation_config=sampling_config,
        )
        print(tokenizer.decode(generated[0], skip_special_tokens=True))
def einsteinify(story):
    """Replace the first character name introduced via "named" with "Einstein".

    Splits the story on whitespace, finds the first word "named", strips
    punctuation from the following word to recover the name, and replaces
    every occurrence of that name in the story.

    Args:
        story: the story text.

    Returns:
        The story with the name replaced, or the story unchanged when no
        usable name follows "named".
    """
    words = story.split()
    if "named" not in words:
        return story
    i = words.index("named")
    # "named" may be the very last word, with nothing after it.
    if i + 1 == len(words):
        return story
    name_maybe_punctuated = words[i + 1]
    name = "".join(c for c in name_maybe_punctuated if c in string.ascii_letters)
    # Bug fix: if the token after "named" has no ASCII letters (e.g.
    # 'named ...'), the extracted name is empty and str.replace("", ...)
    # would insert "Einstein" between every character. Leave the story alone.
    if not name:
        return story
    return story.replace(name, "Einstein")
def apply_einsteinify(example):
    """Dataset map function: rewrite an example's "text" via einsteinify."""
    rewritten = einsteinify(example["text"])
    return {"text": rewritten}
def main(user_args):
    """Fine-tune TinyStories-33M on name-swapped TinyStories examples.

    Args:
        user_args: parsed CLI namespace with `fast` (use CUDA and bigger
            batches) and `val_set_size` (validation examples to keep).
    """
    model = AutoModelForCausalLM.from_pretrained("roneneldan/TinyStories-33M")
    if user_args.fast:
        model.cuda()
    # NOTE(review): tokenizer is loaded from a different repo than the model —
    # presumably TinyStories models share the GPT-Neo tokenizer; confirm.
    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
    # GPT-Neo defines no pad token; reuse EOS so padded batching works.
    tokenizer.pad_token = tokenizer.eos_token
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    d = load_dataset("roneneldan/TinyStories")
    # Small slices keep the demo run short.
    d["train"] = d["train"].select(range(1000))
    d["validation"] = d["validation"].select(range(user_args.val_set_size))
    # Keep only examples where a name was actually swapped in.
    altered_datasets = d.map(apply_einsteinify).filter(
        lambda ex: "Einstein" in ex["text"]
    )

    def tokenize(example):
        # Drop attention_mask etc.; the collator rebuilds what it needs.
        return {"input_ids": tokenizer(example["text"])["input_ids"]}

    tokenized_datasets = altered_datasets.map(tokenize)

    args = TrainingArguments(
        output_dir="/tmp/results",
        per_device_train_batch_size=2 if user_args.fast else 1,
        per_device_eval_batch_size=4 if user_args.fast else 1,
        evaluation_strategy="steps",
        eval_steps=5,
        gradient_accumulation_steps=1,
        num_train_epochs=1,
        weight_decay=0.1,
        lr_scheduler_type="constant",
        learning_rate=5e-5,
        save_steps=5,
        # Bug fix: fp16 mixed precision requires CUDA, but this script also
        # supports a CPU path (no --fast). Unconditional fp16=True makes
        # TrainingArguments raise on CPU; tie it to the GPU flag instead.
        fp16=user_args.fast,
        push_to_hub=False,
        max_steps=20,
    )
    print(tokenized_datasets)
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=args,
        data_collator=data_collator,
        callbacks=[MyCallback(user_args.fast)],
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
    )
    trainer.train()
if __name__ == "__main__":
    # Parse CLI flags and launch fine-tuning.
    main(make_parser().parse_args())