# gpt2_tinystories_fr_scratch.py
# Train a small GPT-2 model from scratch on the TinyStories dataset.
from datasets import load_dataset
from transformers import (
    GPT2Config,
    GPT2LMHeadModel,
    GPT2TokenizerFast,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
import math

# Load the TinyStories dataset (train and validation splits)
dataset = load_dataset("roneneldan/TinyStories")

# Initialize the tokenizer; GPT-2 has no pad token, so reuse EOS for padding
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Define a scaled-down GPT-2 model configuration
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=512,
    n_embd=256,
    n_layer=6,
    n_head=8,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# Initialize model with random weights
model = GPT2LMHeadModel(config)
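
# Optional sanity check (added sketch): report the size of this scaled-down
# model before training; num_parameters() is a standard PreTrainedModel method.
print(f"Model parameters: {model.num_parameters():,}")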

def tokenize_function(examples):
    # Truncate or pad every story to the model's full context length
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=config.n_positions,
        padding="max_length",
    )

# Tokenize both splits, dropping the raw text columns
tokenized_train = dataset["train"].map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
    num_proc=4,  # parallel workers
)
tokenized_test = dataset["validation"].map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["validation"].column_names,
    num_proc=4,
)

# Create the data collator; mlm=False selects causal (next-token) language
# modeling, and padding positions are masked to -100 in the labels
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Define training arguments (effective batch size: 2 per device x 8 accumulation steps = 16)
training_args = TrainingArguments(
    output_dir="./gpt2-tinystories",
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    # evaluation_strategy="steps",
    # eval_steps=500,
    save_steps=1000,
    warmup_steps=500,
    learning_rate=1e-4,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    # load_best_model_at_end=True,
    # metric_for_best_model="perplexity",
    # greater_is_better=False
)

# Initialize the Trainer (eval_dataset and compute_metrics stay commented out,
# matching the disabled evaluation options above)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    # eval_dataset=tokenized_test,
    data_collator=data_collator,
    # compute_metrics=compute_metrics
)

# Train model
trainer.train()
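
# Post-training evaluation (added sketch): the commented-out perplexity options
# above require a metric; a simpler alternative is exp of the mean eval loss.
eval_results = trainer.evaluate(eval_dataset=tokenized_test)
print(f"Validation perplexity: {math.exp(eval_results['eval_loss']):.2f}")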

# Save the final model
trainer.save_model("./gpt2-tinystories-fr-scratch-final")
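
# Also save the tokenizer so the output directory is self-contained
# (Trainer.save_model only saves the tokenizer when one is passed to Trainer).
tokenizer.save_pretrained("./gpt2-tinystories-fr-scratch-final")

# Quick generation smoke test (added sketch; the prompt and sampling settings
# are illustrative assumptions, not tuned values).
from transformers import pipeline

generator = pipeline(
    "text-generation",
    model="./gpt2-tinystories-fr-scratch-final",
    tokenizer=tokenizer,
)
print(generator(
    "Once upon a time",
    max_new_tokens=100,
    do_sample=True,
    top_p=0.95,
)[0]["generated_text"])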