3 changes: 3 additions & 0 deletions llm/main.py
@@ -165,6 +165,7 @@ def main(cfg):
schedulers=scheduler,
max_duration=cfg.max_duration,
eval_interval=cfg.eval_interval,
eval_subset_num_batches=cfg.eval_loader.get('eval_subset_num_batches', -1),
progress_bar=cfg.progress_bar,
log_to_console=cfg.log_to_console,
loggers=loggers,
@@ -186,6 +187,8 @@ def main(cfg):
print("Starting training...")
trainer.fit()

print("Done.")


if __name__ == '__main__':
yaml_path, args_list = sys.argv[1], sys.argv[2:]
22 changes: 22 additions & 0 deletions llm/tests/c4_data_prep_script.py
@@ -0,0 +1,22 @@
# Copyright 2022 MosaicML Benchmarks authors
# SPDX-License-Identifier: Apache-2.0

import os
import shutil
from argparse import Namespace

from convert_c4 import main


def test_download_script_from_api():
# test calling it directly
main(Namespace(**{'splits': ['val'], 'out_root': './my-copy-c4-1'}))
assert os.path.exists(os.path.join(os.getcwd(), 'my-copy-c4-1'))
shutil.rmtree(os.path.join(os.getcwd(), 'my-copy-c4-1'))


def test_download_script_from_cmdline():
# test calling it via the cmd line interface
os.system("python convert_c4.py --out_root ./my-copy-c4-2 --splits val")
assert os.path.exists(os.path.join(os.getcwd(), 'my-copy-c4-2'))
shutil.rmtree(os.path.join(os.getcwd(), 'my-copy-c4-2'))
35 changes: 35 additions & 0 deletions llm/tests/dataloader_tests.py
@@ -0,0 +1,35 @@
# Copyright 2022 MosaicML Benchmarks authors
# SPDX-License-Identifier: Apache-2.0

import os
import pytest
import torch
from omegaconf import OmegaConf as om

from src.data_c4 import build_c4_dataloader


def get_config(conf_path="yamls/mosaic_gpt/125m.yaml"):
os.environ["TOKENIZERS_PARALLELISM"] = "false"
with open(conf_path) as f:
test_cfg = om.load(f)
return test_cfg


def test_correct_padding(batch_size=32):
if not os.path.isdir('./my-copy-c4/val'):
pytest.xfail("c4 dataset not set up as expected")

test_cfg = get_config(conf_path="yamls/mosaic_gpt/125m.yaml")

# Dataloaders
eval_loader = build_c4_dataloader(test_cfg.eval_loader, batch_size)
batch = next(iter(eval_loader))

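# every sequence in the eval batch should be padded out to the full 2048-token context and stored as int64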
assert batch['input_ids'].shape == torch.Size([batch_size, 2048])
assert batch['input_ids'].type() == 'torch.LongTensor'

# we follow the HuggingFace convention that non-attended tokens are 0 in the attention mask and -100 in the labels
a = batch['attention_mask'] == 0
b = batch['labels'] == -100
assert torch.equal(a, b)
165 changes: 165 additions & 0 deletions llm/tests/model_tests.py
@@ -0,0 +1,165 @@
# Copyright 2022 MosaicML Benchmarks authors
# SPDX-License-Identifier: Apache-2.0

import os
import warnings
import torch
import torch.nn as nn
from omegaconf import OmegaConf as om
from composer.utils import reproducibility
from composer.optim import DecoupledAdamW

from src.tokenizer import TOKENIZER_REGISTRY
from src.model_registry import COMPOSER_MODEL_REGISTRY


def get_config(conf_path="yamls/mosaic_gpt/125m.yaml"):
os.environ["TOKENIZERS_PARALLELISM"] = "false"
print(conf_path)
with open(conf_path) as f:
test_cfg = om.load(f)
return test_cfg


def get_objs(conf_path="yamls/mosaic_gpt/125m.yaml"):
warnings.filterwarnings(action='ignore', message='Torchmetrics v0.9 introduced a new argument class property')
test_cfg = get_config(conf_path=conf_path)
tokenizer = TOKENIZER_REGISTRY[test_cfg.tokenizer.type](**test_cfg.tokenizer.args)

reproducibility.seed_all(test_cfg.seed)

# Read FSDP Config as a dict
fsdp_config = test_cfg.get('fsdp_config', None)
fsdp_config = om.to_container(fsdp_config, resolve=True) if fsdp_config else None

# Build Model
# For fast initialization, use `meta` device
print('Initializing model...')
device = 'cpu'
test_cfg.precision = 'fp32'
test_cfg.model.attn_impl = 'torch'
# device = 'cuda'
# test_cfg.precision = 'amp'
test_cfg.model.device = device
test_cfg.device = device

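# use tiny batch sizes so the unit tests stay cheap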
test_cfg.global_train_batch_size = 2
test_cfg.device_eval_batch_size = 2
test_cfg.device_train_microbatch_size = 2

model = COMPOSER_MODEL_REGISTRY[test_cfg.model.name](test_cfg.model)
# Optimizer
assert test_cfg.optimizer.name == 'decoupled_adamw'
optimizer = DecoupledAdamW(
model.parameters(),
lr=test_cfg.optimizer.lr,
betas=test_cfg.optimizer.betas,
eps=test_cfg.optimizer.eps,
weight_decay=test_cfg.optimizer.weight_decay)

return test_cfg, model, optimizer


def gen_random_batch(batch_size, test_cfg):
# generate input batch of random data
batch = {}
batch['input_ids'] = torch.randint(low=0, high=test_cfg.model.vocab_size, size=(batch_size, test_cfg.max_seq_len)).to(test_cfg.device)
batch['labels'] = torch.randint(low=0, high=test_cfg.model.vocab_size, size=(batch_size, test_cfg.max_seq_len)).to(test_cfg.device)
batch['attention_mask'] = torch.ones(size=(batch_size, test_cfg.max_seq_len), dtype=torch.int64).to(test_cfg.device)
return batch


def test_full_forward_and_backward(batch_size=2):
test_cfg, model, optimizer = get_objs(conf_path="yamls/mosaic_gpt/125m.yaml")

batch = gen_random_batch(batch_size, test_cfg)

assert batch['input_ids'].shape == torch.Size([batch_size, test_cfg.max_seq_len])
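# run one forward/backward/optimizer step and verify that the parameters actually change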
model.train()
original_params = next(model.parameters()).clone().data
outputs = model(batch)
loss = model.loss(outputs, batch)
loss.backward()
optimizer.step()
updated_params = next(model.parameters()).clone().data
assert not torch.equal(original_params, updated_params)


def test_attention_mechanism(batch_size=2):
test_cfg, model, _ = get_objs(conf_path="yamls/mosaic_gpt/125m.yaml")

batch = gen_random_batch(batch_size, test_cfg)

model.eval()
# run a partial forward where we explicitly inspect the attention_mask from the causal_attn block
input_ids, key_padding_mask = batch['input_ids'], batch['attention_mask'].bool()

_, S = input_ids.size()
assert (
S <= test_cfg.max_seq_len
), f"Cannot forward input with seq_len={S}, this model only supports seq_len<={test_cfg.max_seq_len}"
pos = torch.arange(
0, S, dtype=torch.long,
device=input_ids.device).unsqueeze(0)

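# reproduce the model's embedding forward pass: token + positional embeddings, then dropout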
tok_emb = model.model.transformer.wte(input_ids)
pos_emb = model.model.transformer.wpe(pos)
x = model.model.transformer.emb_drop(tok_emb + pos_emb)

# the attention mask should be a tensor of shape (bsz, seqlen, seqlen)
# with -inf along the upper triangle as well as wherever there are any pad tokens
# and 0 everywhere else
expected_zerod_weights = nn.Transformer.generate_square_subsequent_mask(test_cfg.max_seq_len)\
.reshape(1, test_cfg.max_seq_len, test_cfg.max_seq_len)
expected_zerod_weights = torch.isneginf(torch.cat(
batch_size*[expected_zerod_weights]
))
torch_key_padding = torch.cat(
test_cfg.max_seq_len*[(~key_padding_mask).reshape(batch_size, 1, test_cfg.max_seq_len)],
axis=1)
expected_zerod_weights |= torch_key_padding

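# step through each transformer block by hand and check that exactly the expected attention weights are zeroed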
for block in model.model.transformer.blocks:
a = block.ln_1(x)
b, attention_weights = block.causal_attn(a, key_padding_mask)

zerod_weights = (attention_weights == 0)
assert torch.equal(expected_zerod_weights, zerod_weights)
x = x + block.resid_attn_dropout(b)
m = block.ln_2(x)
n = block.mlp(m)
x = x + block.resid_mlp_dropout(n)


def test_full_forward_and_backward_gpt_neo(batch_size=2):
warnings.filterwarnings(action='ignore', message='Torchmetrics v0.9 introduced a new argument class property')
conf_path = "yamls/hf_causal_lm/gpt-neo-125m.yaml"
with open(conf_path) as f:
neo_cfg = om.load(f)

device = 'cpu'
neo_cfg.device = device

model = COMPOSER_MODEL_REGISTRY[neo_cfg.model.name](neo_cfg.model).to(device)

assert neo_cfg.optimizer.name == 'decoupled_adamw'
optimizer = DecoupledAdamW(
model.parameters(),
lr=neo_cfg.optimizer.lr,
betas=neo_cfg.optimizer.betas,
eps=neo_cfg.optimizer.eps,
weight_decay=neo_cfg.optimizer.weight_decay)

# set vocab size using the model's num_embeddings
neo_cfg.model.vocab_size = model.model.transformer.wte.num_embeddings
batch = gen_random_batch(batch_size, neo_cfg)

assert batch['input_ids'].shape == torch.Size([batch_size, neo_cfg.max_seq_len])
model.train()
original_params = next(model.parameters()).clone().data
outputs = model(batch)
loss = model.loss(outputs, batch)
loss.backward()
optimizer.step()
updated_params = next(model.parameters()).clone().data
assert not torch.equal(original_params, updated_params)
66 changes: 66 additions & 0 deletions llm/tests/tokenizer_tests.py
@@ -0,0 +1,66 @@
# Copyright 2022 MosaicML Benchmarks authors
# SPDX-License-Identifier: Apache-2.0

from omegaconf import OmegaConf as om

from src.tokenizer import TOKENIZER_REGISTRY


def get_config(conf_path="yamls/mosaic_gpt/125m.yaml"):
with open(conf_path) as f:
test_cfg = om.load(f)
return test_cfg


def test_load_tokenizer():
test_cfg = get_config(conf_path="yamls/mosaic_gpt/125m.yaml")
truncation = True
padding = 'max_length'

tokenizer = TOKENIZER_REGISTRY[test_cfg.tokenizer.type](**test_cfg.tokenizer.args)
assert tokenizer.tokenizer.vocab_size == 50257
assert tokenizer.tokenizer.name_or_path == 'gpt2'

in_str = "hello\n\nhello"
out_token_key = [31373, 198, 198, 31373]

# test calling the underlying tokenizer's encode() directly
out = tokenizer.tokenizer.encode(in_str)
assert out == out_token_key

# tokenizer __call__
out = tokenizer.tokenizer(in_str)['input_ids']
assert out == out_token_key

# tokenizer __call__ with kwargs
padded_tokenize = tokenizer.tokenizer(
in_str,
truncation=truncation,
padding=padding,
max_length=tokenizer.max_seq_len
)['input_ids']
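# padding is expected to use token id 50256 (GPT-2's eos token, commonly reused as the pad token)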
out_pad_tokens = out_token_key + [50256] * (tokenizer.max_seq_len - 4)
assert padded_tokenize == out_pad_tokens

# wrapper class __call__
out = tokenizer(in_str)['input_ids']
assert out == out_token_key

# wrapper class __call__ with kwargs
padded_tokenize = tokenizer(
in_str,
truncation=truncation,
padding=padding,
max_length=tokenizer.max_seq_len
)['input_ids']
assert padded_tokenize == out_pad_tokens

# check attn mask
attention_mask = tokenizer(
in_str,
truncation=truncation,
padding=padding,
max_length=tokenizer.max_seq_len
)['attention_mask']
attn_mask_key = [1, 1, 1, 1] + [0] * (tokenizer.max_seq_len - 4)
assert attention_mask == attn_mask_key
64 changes: 64 additions & 0 deletions llm/tests/training_integration_tests.py
@@ -0,0 +1,64 @@
# Copyright 2022 MosaicML Benchmarks authors
# SPDX-License-Identifier: Apache-2.0

import os
import warnings
import pytest
import torch
from omegaconf import OmegaConf as om

from main import main


def gpt_tiny_cfg(conf_path="yamls/mosaic_gpt/125m.yaml"):
""" create gpt tiny cfg """

with open(conf_path) as f:
test_cfg = om.load(f)
# reuse the eval dataset for training to avoid downloading / processing the train set
test_cfg.train_loader.dataset = test_cfg.eval_loader.dataset

test_cfg.global_train_batch_size = 8
test_cfg.device_eval_batch_size = 4
test_cfg.device_train_microbatch_size = 4

test_cfg.max_duration = '4ba'
test_cfg.eval_interval = '4ba'
test_cfg.eval_loader.eval_subset_num_batches = 2
test_cfg.save_interval = '4ba'
test_cfg.run_name = 'gpt-mini-integration-test'
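# shrink the model and sequence length so the integration test runs quickly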
test_cfg.model.d_model = 32
test_cfg.model.n_heads = 2
test_cfg.model.n_layers = 2
test_cfg.max_seq_len = 256
test_cfg.model.max_seq_len = test_cfg.max_seq_len
test_cfg.tokenizer.args.max_seq_len = test_cfg.max_seq_len
test_cfg.train_loader.dataset.max_seq_len = test_cfg.max_seq_len
test_cfg.eval_loader.dataset.max_seq_len = test_cfg.max_seq_len

return test_cfg


@pytest.mark.parametrize(
'device',
[
'cpu',
pytest.param(
'cuda',
marks=pytest.mark.skipif(not torch.cuda.is_available(), reason="testing with cuda requires GPU")
),
])
def test_train(device):
if not os.path.isdir('./my-copy-c4/val'):
pytest.xfail("c4 dataset not set up as expected")

warnings.filterwarnings(action='ignore', category=DeprecationWarning, message="Using the 'grad_clip_norm' field in Trainer is deprecated. Please usethe GradientClipping Algorithm in composer.algorithms.gradient_clipping.")

test_cfg = gpt_tiny_cfg(conf_path="yamls/mosaic_gpt/125m.yaml")

if device == 'cpu':
test_cfg.model.device = 'cpu'
test_cfg.model.attn_impl = 'torch'
test_cfg.precision = 'fp32'

main(test_cfg)