3 changes: 3 additions & 0 deletions llm/main.py
@@ -165,6 +165,7 @@ def main(cfg):
schedulers=scheduler,
max_duration=cfg.max_duration,
eval_interval=cfg.eval_interval,
eval_subset_num_batches=cfg.eval_loader.get('eval_subset_num_batches', -1),
progress_bar=cfg.progress_bar,
log_to_console=cfg.log_to_console,
loggers=loggers,
@@ -186,6 +187,8 @@ def main(cfg):
print("Starting training...")
trainer.fit()

print("Done.")


if __name__ == '__main__':
yaml_path, args_list = sys.argv[1], sys.argv[2:]
22 changes: 22 additions & 0 deletions llm/tests/c4_data_prep_script.py
@@ -0,0 +1,22 @@
# Copyright 2022 MosaicML Benchmarks authors
# SPDX-License-Identifier: Apache-2.0

import os
import shutil
from argparse import Namespace

from convert_c4 import main


def test_download_script_from_api():
# test calling it directly
main(Namespace(**{'splits': ['val'], 'out_root': './my-copy-c4-1'}))
assert os.path.exists(os.path.join(os.getcwd(), 'my-copy-c4-1'))
shutil.rmtree(os.path.join(os.getcwd(), 'my-copy-c4-1'))


def test_download_script_from_cmdline():
# test calling it via the cmd line interface
os.system("python convert_c4.py --out_root ./my-copy-c4-2 --splits val")
assert os.path.exists(os.path.join(os.getcwd(), 'my-copy-c4-2'))
shutil.rmtree(os.path.join(os.getcwd(), 'my-copy-c4-2'))
35 changes: 35 additions & 0 deletions llm/tests/dataloader_tests.py
@@ -0,0 +1,35 @@
# Copyright 2022 MosaicML Benchmarks authors
# SPDX-License-Identifier: Apache-2.0

import os
import pytest
import torch
from omegaconf import OmegaConf as om

from src.data_c4 import build_c4_dataloader


def get_config(conf_path="yamls/mosaic_gpt/125m.yaml"):
os.environ["TOKENIZERS_PARALLELISM"] = "false"
with open(conf_path) as f:
test_cfg = om.load(f)
return test_cfg


def test_correct_padding(batch_size=32):
if not os.path.isdir('./my-copy-c4/val'):
pytest.xfail("c4 dataset not set up as expected")

test_cfg = get_config(conf_path="yamls/mosaic_gpt/125m.yaml")

# Dataloaders
eval_loader = build_c4_dataloader(test_cfg.eval_loader, batch_size)
batch = next(iter(eval_loader))

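# every sequence in the eval batch should be padded out to the full 2048-token context and stored as int64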
assert batch['input_ids'].shape == torch.Size([batch_size, 2048])
assert batch['input_ids'].type() == 'torch.LongTensor'

# we follow the HuggingFace convention that non-attended tokens are 0 in the attention mask and -100 in the labels
a = batch['attention_mask'] == 0
b = batch['labels'] == -100
assert torch.equal(a, b)
165 changes: 165 additions & 0 deletions llm/tests/model_tests.py
@@ -0,0 +1,165 @@
# Copyright 2022 MosaicML Benchmarks authors
# SPDX-License-Identifier: Apache-2.0

import os
import warnings
import torch
import torch.nn as nn
from omegaconf import OmegaConf as om
from composer.utils import reproducibility
from composer.optim import DecoupledAdamW

from src.tokenizer import TOKENIZER_REGISTRY
from src.model_registry import COMPOSER_MODEL_REGISTRY


def get_config(conf_path="yamls/mosaic_gpt/125m.yaml"):
os.environ["TOKENIZERS_PARALLELISM"] = "false"
print(conf_path)
with open(conf_path) as f:
test_cfg = om.load(f)
return test_cfg


def get_objs(conf_path="yamls/mosaic_gpt/125m.yaml"):
warnings.filterwarnings(action='ignore', message='Torchmetrics v0.9 introduced a new argument class property')
test_cfg = get_config(conf_path=conf_path)
tokenizer = TOKENIZER_REGISTRY[test_cfg.tokenizer.type](**test_cfg.tokenizer.args)

reproducibility.seed_all(test_cfg.seed)

# Read FSDP Config as a dict
fsdp_config = test_cfg.get('fsdp_config', None)
fsdp_config = om.to_container(fsdp_config, resolve=True) if fsdp_config else None

# Build Model
# For fast initialization, use `meta` device
print('Initializing model...')
device = 'cpu'
test_cfg.precision = 'fp32'
test_cfg.model.attn_impl = 'torch'
# device = 'cuda'
# test_cfg.precision = 'amp'
test_cfg.model.device = device
test_cfg.device = device

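# use tiny batch sizes so the unit tests stay cheap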
test_cfg.global_train_batch_size = 2
test_cfg.device_eval_batch_size = 2
test_cfg.device_train_microbatch_size = 2

model = COMPOSER_MODEL_REGISTRY[test_cfg.model.name](test_cfg.model)
# Optimizer
assert test_cfg.optimizer.name == 'decoupled_adamw'
optimizer = DecoupledAdamW(
model.parameters(),
lr=test_cfg.optimizer.lr,
betas=test_cfg.optimizer.betas,
eps=test_cfg.optimizer.eps,
weight_decay=test_cfg.optimizer.weight_decay)

return test_cfg, model, optimizer


def gen_random_batch(batch_size, test_cfg):
# generate input batch of random data
batch = {}
batch['input_ids'] = torch.randint(low=0, high=test_cfg.model.vocab_size, size=(batch_size, test_cfg.max_seq_len)).to(test_cfg.device)
batch['labels'] = torch.randint(low=0, high=test_cfg.model.vocab_size, size=(batch_size, test_cfg.max_seq_len)).to(test_cfg.device)
batch['attention_mask'] = torch.ones(size=(batch_size, test_cfg.max_seq_len), dtype=torch.int64).to(test_cfg.device)
return batch


def test_full_forward_and_backward(batch_size=2):
test_cfg, model, optimizer = get_objs(conf_path="yamls/mosaic_gpt/125m.yaml")

batch = gen_random_batch(batch_size, test_cfg)

assert batch['input_ids'].shape == torch.Size([batch_size, test_cfg.max_seq_len])
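# run one forward/backward/optimizer step and verify that the parameters actually change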
model.train()
original_params = next(model.parameters()).clone().data
outputs = model(batch)
loss = model.loss(outputs, batch)
loss.backward()
optimizer.step()
updated_params = next(model.parameters()).clone().data
assert not torch.equal(original_params, updated_params)


def test_attention_mechanism(batch_size=2):
test_cfg, model, _ = get_objs(conf_path="yamls/mosaic_gpt/125m.yaml")

batch = gen_random_batch(batch_size, test_cfg)

model.eval()
# run a partial forward where we explicitly inspect the attention_mask from the causal_attn block
input_ids, key_padding_mask = batch['input_ids'], batch['attention_mask'].bool()

_, S = input_ids.size()
assert (
S <= test_cfg.max_seq_len
), f"Cannot forward input with seq_len={S}, this model only supports seq_len<={test_cfg.max_seq_len}"
pos = torch.arange(
0, S, dtype=torch.long,
device=input_ids.device).unsqueeze(0)

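# reproduce the model's embedding forward pass: token + positional embeddings, then dropout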
tok_emb = model.model.transformer.wte(input_ids)
pos_emb = model.model.transformer.wpe(pos)
x = model.model.transformer.emb_drop(tok_emb + pos_emb)

# the attention mask should be a tensor of shape (bsz, seqlen, seqlen)
# with -inf along the upper triangle as well as wherever there are any pad tokens
# and 0 everywhere else
expected_zerod_weights = nn.Transformer.generate_square_subsequent_mask(test_cfg.max_seq_len)\
.reshape(1, test_cfg.max_seq_len, test_cfg.max_seq_len)
expected_zerod_weights = torch.isneginf(torch.cat(
batch_size*[expected_zerod_weights]
))
torch_key_padding = torch.cat(
test_cfg.max_seq_len*[(~key_padding_mask).reshape(batch_size, 1, test_cfg.max_seq_len)],
axis=1)
expected_zerod_weights |= torch_key_padding

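# step through each transformer block by hand and check that exactly the expected attention weights are zeroed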
for block in model.model.transformer.blocks:
a = block.ln_1(x)
b, attention_weights = block.causal_attn(a, key_padding_mask)

zerod_weights = (attention_weights == 0)
assert torch.equal(expected_zerod_weights, zerod_weights)
x = x + block.resid_attn_dropout(b)
m = block.ln_2(x)
n = block.mlp(m)
x = x + block.resid_mlp_dropout(n)


def test_full_forward_and_backward_gpt_neo(batch_size=2):
warnings.filterwarnings(action='ignore', message='Torchmetrics v0.9 introduced a new argument class property')
conf_path = "yamls/hf_causal_lm/gpt-neo-125m.yaml"
with open(conf_path) as f:
neo_cfg = om.load(f)

device = 'cpu'
neo_cfg.device = device

model = COMPOSER_MODEL_REGISTRY[neo_cfg.model.name](neo_cfg.model).to(device)

assert neo_cfg.optimizer.name == 'decoupled_adamw'
optimizer = DecoupledAdamW(
model.parameters(),
lr=neo_cfg.optimizer.lr,
betas=neo_cfg.optimizer.betas,
eps=neo_cfg.optimizer.eps,
weight_decay=neo_cfg.optimizer.weight_decay)

# set vocab size using the model's num_embeddings
neo_cfg.model.vocab_size = model.model.transformer.wte.num_embeddings
batch = gen_random_batch(batch_size, neo_cfg)

assert batch['input_ids'].shape == torch.Size([batch_size, neo_cfg.max_seq_len])
model.train()
original_params = next(model.parameters()).clone().data
outputs = model(batch)
loss = model.loss(outputs, batch)
loss.backward()
optimizer.step()
updated_params = next(model.parameters()).clone().data
assert not torch.equal(original_params, updated_params)
66 changes: 66 additions & 0 deletions llm/tests/tokenizer_tests.py
@@ -0,0 +1,66 @@
# Copyright 2022 MosaicML Benchmarks authors
# SPDX-License-Identifier: Apache-2.0

from omegaconf import OmegaConf as om

from src.tokenizer import TOKENIZER_REGISTRY


def get_config(conf_path="yamls/mosaic_gpt/125m.yaml"):
with open(conf_path) as f:
test_cfg = om.load(f)
return test_cfg


def test_load_tokenizer():
test_cfg = get_config(conf_path="yamls/mosaic_gpt/125m.yaml")
truncation = True
padding = 'max_length'

tokenizer = TOKENIZER_REGISTRY[test_cfg.tokenizer.type](**test_cfg.tokenizer.args)
assert tokenizer.tokenizer.vocab_size == 50257
assert tokenizer.tokenizer.name_or_path == 'gpt2'

in_str = "hello\n\nhello"
out_token_key = [31373, 198, 198, 31373]

# test calling the underlying tokenizer's encode() directly
out = tokenizer.tokenizer.encode(in_str)
assert out == out_token_key

# tokenizer __call__
out = tokenizer.tokenizer(in_str)['input_ids']
assert out == out_token_key

# tokenizer __call__ with kwargs
padded_tokenize = tokenizer.tokenizer(
in_str,
truncation=truncation,
padding=padding,
max_length=tokenizer.max_seq_len
)['input_ids']
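# padding is expected to use token id 50256 (GPT-2's eos token, commonly reused as the pad token)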
out_pad_tokens = out_token_key + [50256] * (tokenizer.max_seq_len - 4)
assert padded_tokenize == out_pad_tokens

# wrapper class __call__
out = tokenizer(in_str)['input_ids']
assert out == out_token_key

# wrapper class __call__ with kwargs
padded_tokenize = tokenizer(
in_str,
truncation=truncation,
padding=padding,
max_length=tokenizer.max_seq_len
)['input_ids']
assert padded_tokenize == out_pad_tokens

# check attn mask
attention_mask = tokenizer(
in_str,
truncation=truncation,
padding=padding,
max_length=tokenizer.max_seq_len
)['attention_mask']
attn_mask_key = [1, 1, 1, 1] + [0] * (tokenizer.max_seq_len - 4)
assert attention_mask == attn_mask_key
64 changes: 64 additions & 0 deletions llm/tests/training_integration_tests.py
@@ -0,0 +1,64 @@
# Copyright 2022 MosaicML Benchmarks authors
# SPDX-License-Identifier: Apache-2.0

import os
import warnings
import pytest
import torch
from omegaconf import OmegaConf as om

from main import main


def gpt_tiny_cfg(conf_path="yamls/mosaic_gpt/125m.yaml"):
""" create gpt tiny cfg """

with open(conf_path) as f:
test_cfg = om.load(f)
# reuse the eval dataset for training to avoid downloading / processing the train set
test_cfg.train_loader.dataset = test_cfg.eval_loader.dataset

test_cfg.global_train_batch_size = 8
test_cfg.device_eval_batch_size = 4
test_cfg.device_train_microbatch_size = 4

test_cfg.max_duration = '4ba'
test_cfg.eval_interval = '4ba'
test_cfg.eval_loader.eval_subset_num_batches = 2
test_cfg.save_interval = '4ba'
test_cfg.run_name = 'gpt-mini-integration-test'
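# shrink the model and sequence length so the integration test runs quickly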
test_cfg.model.d_model = 32
test_cfg.model.n_heads = 2
test_cfg.model.n_layers = 2
test_cfg.max_seq_len = 256
test_cfg.model.max_seq_len = test_cfg.max_seq_len
test_cfg.tokenizer.args.max_seq_len = test_cfg.max_seq_len
test_cfg.train_loader.dataset.max_seq_len = test_cfg.max_seq_len
test_cfg.eval_loader.dataset.max_seq_len = test_cfg.max_seq_len

return test_cfg


@pytest.mark.parametrize(
'device',
[
'cpu',
pytest.param(
'cuda',
marks=pytest.mark.skipif(not torch.cuda.is_available(), reason="testing with cuda requires GPU")
),
])
def test_train(device):
if not os.path.isdir('./my-copy-c4/val'):
pytest.xfail("c4 dataset not set up as expected")

warnings.filterwarnings(action='ignore', category=DeprecationWarning, message="Using the 'grad_clip_norm' field in Trainer is deprecated. Please usethe GradientClipping Algorithm in composer.algorithms.gradient_clipping.")

test_cfg = gpt_tiny_cfg(conf_path="yamls/mosaic_gpt/125m.yaml")

if device == 'cpu':
test_cfg.model.device = 'cpu'
test_cfg.model.attn_impl = 'torch'
test_cfg.precision = 'fp32'

main(test_cfg)