
Merge branch 'master' into add_training_estimates
klei22 authored Aug 24, 2024
2 parents 7487449 + a65c5c3 commit 858bc2f
Showing 19 changed files with 670 additions and 176 deletions.
28 changes: 28 additions & 0 deletions explorations/consmax_v2_sweep.json
@@ -0,0 +1,28 @@
[
    {
        "max_iters": ["3500"],
        "n_layer": ["12"],
        "n_head": ["6"],
        "n_embd": ["384"],
        "block_size":["256"],
        "device": ["cuda"],
        "dtype": ["bfloat16"],
        "dataset": ["shakespeare_char"],
        "use_gradient_checkpointing": [true],
        "use_rotary_embeddings": [false],
        "use_abs_pos_embeddings": [true],
        "compile": [false],
        "softmax_variant_attn": ["consmax_v2"],
        "consmax_initial_beta": ["2.5"],
        "consmax_initial_gamma": ["100.0"],
        "softmax_io_logging": [true],
        "create_statistics": [true],
        "plot_statistics": [true],
        "consmax_per_head": {
            "conditions": [["softmax_variant_attn", "consmax_v2"]],
            "options": [true, false]
        },
        "use_post_ln": [true, false]
    }
]
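
The nested "consmax_per_head" entry is a conditional sweep parameter: its "options" only fan out when every ["key", "value"] pair in "conditions" matches the current combination. A minimal sketch of that expansion logic, assuming run_experiments.py interprets the config this way (hypothetical helper, not the repository's actual implementation):

import itertools

def expand(config):
    # Plain parameters are lists of candidate values; conditional parameters
    # are dicts carrying "conditions" and "options".
    plain = {k: v for k, v in config.items() if isinstance(v, list)}
    conditional = {k: v for k, v in config.items() if isinstance(v, dict)}
    for combo in itertools.product(*plain.values()):
        base = dict(zip(plain.keys(), combo))
        # A conditional parameter only fans out when all its conditions hold.
        gated = {k: v["options"] for k, v in conditional.items()
                 if all(base.get(ck) == cv for ck, cv in v["conditions"])}
        # product() over zero gated params yields one empty tuple, i.e. just base.
        for extra in itertools.product(*gated.values()):
            yield {**base, **dict(zip(gated.keys(), extra))}

Under this reading, the config above expands to four runs: use_post_ln in {true, false} crossed with consmax_per_head in {true, false} (the latter active because softmax_variant_attn is "consmax_v2").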

26 changes: 13 additions & 13 deletions explorations/strongermax_sweep.json
@@ -1,27 +1,27 @@
 [
     {
-        "max_iters": ["6000"],
+        "max_iters": ["3500"],
         "n_layer": ["6"],
         "n_kv_group": ["6"],
         "n_head": ["6"],
         "n_embd": ["384"],
         "block_size":["256"],
         "device": ["cuda"],
-        "dtype": ["float16"],
+        "dtype": ["bfloat16"],
         "dataset": ["shakespeare_char"],
         "use_rotary_embeddings": [false],
         "use_abs_pos_embeddings": [true],
-        "compile": [true],
+        "compile": [false],
         "softmax_variant_attn": ["strongermax"],
-        "strongermax_strength": ["1.5", "2", "2.719", "3", "4", "5"],
-        "strongermax_divisor": ["1.0", "10.0", "100.0", "1000.0"],
-        "use_post_ln": [true, false],
-        "strongermax_use_xmax" : [true, false],
-        "strongermax_sum_to_1": [true, false],
-        "statistic": ["all_stats"],
-        "graph_type":["all"],
-        "box_plot_interval": ["1000"],
-        "box_plot_statistic": ["all"],
-        "patience": ["1000"]
+        "strongermax_strength": ["2.719"],
+        "strongermax_divisor": ["256"],
+        "strongermax_use_xmax" : [true],
+        "strongermax_xmax_guess" : ["-50"],
+        "strongermax_overflow_recompute" : [true],
+        "softmax_io_logging" : [false],
+        "create_statistics" : [false],
+        "plot_statistics" : [false],
+        "statistic" : ["input_max"],
+        "strongermax_sum_to_1": [false]
     }
 ]
36 changes: 23 additions & 13 deletions gpt_conf.py
@@ -18,6 +18,11 @@ class GPTConfig:
     moe_top_k: int = 2
     moe_router_scheme: str = "softmax"
 
+    # Logging options
+    softmax_io_logging: bool = False
+    consmax_beta_gamma_logging: bool = False
+    plot_statistics: bool = False
+
     # Training options
     ## Gradient Checkpointing - More memory efficient (can do long contexts), but is slower
     use_gradient_checkpointing: bool = False
@@ -49,10 +54,13 @@ class GPTConfig:
     ## ConSmax Options
     consmax_initial_beta: float = 2.0 # beta adjustment
     consmax_initial_gamma: float = 100.0 # denominator adjustment
-    consmax_use_euler_base: bool = True # use 'e' as base for ConSmax, default
     consmax_base: float = 2.0 # base to utilize for ConSmax
+    consmax_use_euler_base: bool = True # use 'e' as base for ConSmax, default
 
-    ## SaturatingConSmax special options (otherwise same as ConSmax)
+    ## ConSmaxV2 Special Options
+    consmax_per_head: bool = True # different beta gamma per head
+
+    ## SaturatingConSmax Special options (otherwise same as ConSmax)
     consmax_saturation: float = 11.0 # for SaturatingConSmax saturation point
     consmax_learnable_beta: bool = True
     consmax_learnable_gamma: bool = True
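
For context on the beta/gamma/base fields: ConSmax (per the original ConSmax paper) replaces softmax's max-subtraction and sum-normalization with learnable constants, and ConSmaxV2's consmax_per_head gives each attention head its own beta/gamma pair. A minimal sketch of the standard formulation, not this repository's actual model.py code:

import torch

def consmax(x, beta=2.0, gamma=100.0, use_euler_base=True, base=2.0):
    # beta shifts the logits (standing in for the usual max-subtraction);
    # gamma rescales the result (standing in for the softmax denominator).
    shifted = x - beta
    out = torch.exp(shifted) if use_euler_base else base ** shifted
    return out / gamma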
@@ -72,9 +80,11 @@ class GPTConfig:
 
     ## Strongermax options
     strongermax_strength: float = 2.0 # Softermax with option of 'stronger' (larger integer) bases
-    strongermax_sum_to_1: bool = False # Softermax with option of 'stronger' (larger integer) bases
-    strongermax_divisor: float = 1.0 # Softermax with option of 'stronger' (larger integer) bases
-    strongermax_use_xmax: bool = True # Softermax with option of 'stronger' (larger integer) bases
+    strongermax_sum_to_1: bool = False
+    strongermax_divisor: float = 1.0
+    strongermax_use_xmax: bool = True
+    strongermax_xmax_guess: float = 1.0
+    strongermax_overflow_recompute: bool = False
 
     ## ExpPolymax options
     exppolymax_use_euler_base: bool = True
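
The Strongermax fields read as a parameterized softmax. The sketch below is inferred purely from the option names above (hypothetical; the authoritative implementation is in model.py):

import torch

def strongermax(x, strength=2.0, divisor=1.0, use_xmax=True,
                xmax_guess=None, overflow_recompute=False, sum_to_1=False):
    if use_xmax:
        # Subtracting a maximum keeps strength**x from overflowing; a fixed
        # xmax_guess avoids a full reduction over the logits, and
        # overflow_recompute presumably falls back to the true max whenever
        # the guess is exceeded.
        if xmax_guess is not None:
            shift = xmax_guess
            if overflow_recompute and (x > xmax_guess).any():
                shift = x.max(dim=-1, keepdim=True).values
        else:
            shift = x.max(dim=-1, keepdim=True).values
        x = x - shift
    out = (strength ** x) / divisor
    if sum_to_1:
        out = out / out.sum(dim=-1, keepdim=True)
    return out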
@@ -137,7 +147,7 @@ class GPTConfig:
     linear_std_init: float= 0.02
 
     # Quantizations
-
+    ## Embedding Quantizations
     quantize_wte: bool = False
     quantize_wpe: bool = False
@@ -200,13 +210,13 @@ def from_json(cls, filename: str):
         try:
             with open(filename, 'r') as json_file:
                 config_dict = json.load(json_file)
 
             # Get all field names of the dataclass
             field_names = {f.name for f in fields(cls)}
 
             # Filter the loaded dict to only include valid fields
             filtered_dict = {k: v for k, v in config_dict.items() if k in field_names}
 
             # Create and return a new instance
             return cls(**filtered_dict)
         except FileNotFoundError:
@@ -218,14 +228,14 @@ def from_json(cls, filename: str):
         except TypeError as e:
             print(f"Error: Invalid data in JSON file. {str(e)}")
             return None
 
     def to_json(self, filename: str):
         """
         Function to save a GPTConfig object as json to be used for later model creation
-        input:
+        input:
         - fout: string = filename of saved config file
         """
         conf_dict = asdict(self)
 
8 changes: 8 additions & 0 deletions huggingface_model/gpt_sample.py
@@ -0,0 +1,8 @@
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained("gpt2-custom")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-custom")

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
output = generator("Once upon a time", max_length=50, num_return_sequences=1)
print(output)
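
Here "gpt2-custom" is the local directory that huggingface_model/gpt_train.py (below) saves the model and tokenizer into, so this sample script assumes training has already been run.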
13 changes: 2 additions & 11 deletions huggingface_model/gpt_train.py
@@ -1,8 +1,6 @@
-from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer
+from transformers import Trainer, TrainingArguments
 from datasets import load_dataset
 import gpt_model
-from transformers import pipeline
-from torch.utils.data import DataLoader
 
 # Load the Wikitext dataset
 dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
@@ -51,11 +49,4 @@ def tokenize_function(examples):
 trainer.train()
 
 gpt_model.pretrained_model.save_pretrained("gpt2-custom")
-gpt_model.tokenizer.save_pretrained("gpt2-custom")
-
-# model = AutoModelForCausalLM.from_pretrained("gpt2-custom")
-# tokenizer = AutoTokenizer.from_pretrained("gpt2-custom")
-
-generator = pipeline("text-generation", model=gpt_model.pretrained_model, tokenizer=gpt_model.tokenizer)
-output = generator("Once upon a time", max_length=50, num_return_sequences=1)
-print(output)
+gpt_model.tokenizer.save_pretrained("gpt2-custom")
8 changes: 8 additions & 0 deletions huggingface_model/upload.py
@@ -0,0 +1,8 @@
from transformers import AutoTokenizer, AutoConfig, AutoModel

config = AutoConfig.from_pretrained("gpt2-custom")
config.push_to_hub("custom_gpt2")
model = AutoModel.from_pretrained("gpt2-custom")
model.push_to_hub("custom_gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2-custom")
tokenizer.push_to_hub("custom_gpt2")
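
Note that push_to_hub assumes an already-authenticated Hugging Face session (e.g., via huggingface-cli login) and creates or reuses a Hub repo named custom_gpt2 under that account.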
4 changes: 2 additions & 2 deletions legacy_model.py
@@ -120,7 +120,7 @@ def forward(self, x):
         return x
 
 @dataclass
-class kGPTConfig:
+class GPTConfig:
     block_size: int = 1024
     vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
     n_layer: int = 12
@@ -129,7 +129,7 @@ class kGPTConfig:
     dropout: float = 0.0
     bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
 
-class kGPT(nn.Module):
+class GPT(nn.Module):
 
     def __init__(self, config):
         super().__init__()
35 changes: 5 additions & 30 deletions model.py
@@ -643,38 +643,13 @@ def crop_block_size(self, block_size):
             block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]
 
     @classmethod
-    def from_pretrained(cls, model_type, override_args=None):
+    def from_pretrained(cls, config, model_type):
         # assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
-        override_args = override_args or {} # default to empty dict
-        # only dropout can be overridden see more notes below
-        assert all(k == 'dropout' for k in override_args)
-        from transformers import GPT2LMHeadModel, AutoModelForCausalLM
+        from transformers import GPT2LMHeadModel
 
         print(f"loading weights from pretrained gpt: {model_type}")
 
-        # n_layer, n_head and n_embd are determined from model_type
-        config_args = {
-            'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
-            'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
-            'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
-            'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
-        }[model_type]
-        print("forcing vocab_size=50257, block_size=1024, bias=True")
-        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
-        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
-        config_args['bias'] = True # always True for GPT model checkpoints
-        config_args['window_size'] = 128 # always None for GPT model checkpoints
-        # we can override the dropout rate, if desired
-        if 'dropout' in override_args:
-            print(f"overriding dropout rate to {override_args['dropout']}")
-            config_args['dropout'] = override_args['dropout']
-
-        # create a from-scratch initialized minGPT model
-        # TODO: pass more cmd line flags like "softmax" variant into this from_pretrained
-        config = GPTConfig(**config_args)
-
-        # overriding our custom GPTConf presets of "rmsnorm" to "layernorm" for compatibility
-        config.norm_variant_attn = "layernorm"
-        config.norm_variant_output = "layernorm"
-
         model = GPT(config)
         model_hf = GPT2LMHeadModel.from_pretrained(model_type)
 
@@ -703,7 +678,7 @@ def from_pretrained(cls, model_type, override_args=None):
                 sd[key].copy_(sd_hf[key].t())
             elif key.endswith('attn.c_attn.weight') or key.endswith('attn.c_attn.bias'):
                 # split into c_attn_q/k/v
-                q, k, v = sd_hf[key].split(config.n_embd, dim=-1)
+                q, k, v = sd_hf[key].t().split(config.n_embd, dim=0)
                 q_key_str = key.replace("c_attn", "c_attn_q")
                 k_key_str = key.replace("c_attn", "c_attn_k")
                 v_key_str = key.replace("c_attn", "c_attn_v")
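
The revised split transposes first because Hugging Face's GPT-2 checkpoint stores c_attn as a Conv1D module whose weight is laid out (in_features, out_features), the transpose of nn.Linear's (out_features, in_features). A shape sketch with illustrative values for 'gpt2' (n_embd=768):

import torch

n_embd = 768
w_hf = torch.randn(n_embd, 3 * n_embd)  # HF Conv1D layout: (in, out)

# Transpose to Linear layout first, then split along dim=0 so each chunk
# is an (n_embd, n_embd) weight ready for the separate c_attn_q/k/v Linears.
q, k, v = w_hf.t().split(n_embd, dim=0)

# For attn.c_attn.bias (1-D, shape (3*n_embd,)) the same line still works:
# .t() is a no-op on 1-D tensors, and dim=0 equals the old dim=-1 split.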
38 changes: 38 additions & 0 deletions model_info_util/model_info.py
@@ -0,0 +1,38 @@
import torch
from torchinfo import summary
from rich import print
from rich.console import Console
from rich.text import Text
import io

console = Console()

def print_summary(model):
    block_header = Text(f"High Level Parameters:", style="bold underline purple")
    console.print(block_header)
    summary(model)

def print_model_blocks(model, block_range=1):
    for idx, block in enumerate(model.transformer.h):
        block_header = Text(f"Summary for Block {idx + 1}:", style="bold underline green")
        console.print(block_header)
        summary(block)
        if (idx + 1) == block_range:
            break

def print_module_structure(module):
    console.print("-" * 50, style="dim")
    for name, submodule in module.named_children():
        console.print(f'{name}: {submodule}', style="yellow")
    console.print("-" * 50, style="dim")

def print_model_tree(model, indent="", print_params=False):
    for name, module in model.named_children():
        print(indent + name + ": " + str(module.__class__.__name__))
        if isinstance(module, torch.nn.Module):
            # Print parameters for the next level only
            if print_params:
                for param_name, _ in module.named_parameters():
                    print(indent + "  " + param_name)
            else: # Recursively print submodules without parameters
                print_model_tree(module, indent + "  ")
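
A usage sketch for these helpers (assumes a GPT instance built from this repository's model.py, with torchinfo and rich installed):

from model import GPT, GPTConfig
from model_info_util.model_info import (print_summary, print_model_blocks,
                                        print_module_structure, print_model_tree)

model = GPT(GPTConfig())
print_summary(model)                          # torchinfo high-level parameter summary
print_model_blocks(model, block_range=2)      # summaries for the first two transformer blocks
print_module_structure(model.transformer.h[0])  # children of one block
print_model_tree(model, print_params=True)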
4 changes: 2 additions & 2 deletions run_experiments.py
@@ -12,8 +12,8 @@
 
 def parse_args():
     parser = argparse.ArgumentParser(description="Run experiments based on a json configuration file.")
-    parser.add_argument("--config", type=str, required=True, help="Path to the configuration JSON file.")
-    parser.add_argument("--output_dir", type=str, default="out", help="Directory to place the set of output checkpoints.")
+    parser.add_argument('-c', "--config", type=str, required=True, help="Path to the configuration JSON file.")
+    parser.add_argument('-o', "--output_dir", type=str, default="out", help="Directory to place the set of output checkpoints.")
     parser.add_argument("--csv_ckpt_dir", type=str, default="", help="Directory to place the set of csv checkpoints in csv_logs.")
     parser.add_argument("--prefix", type=str, default='', help="Optional prefix for tensorboard_run_name and out_dir.")
    parser.add_argument("--add_names", action="store_true", help="Include names of values of the configuration parameters in addition to values (may cause too long a file name).")
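
With these short flags, a sweep from this commit can be launched as, for example: python run_experiments.py -c explorations/consmax_v2_sweep.json -o out_consmax_v2 (the output directory name here is illustrative).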
28 changes: 21 additions & 7 deletions sample.py
@@ -15,12 +15,13 @@
 from collections import OrderedDict
 
 from model import GPT, GPTConfig
 
+from model_info_util.model_info import print_summary, print_module_structure, print_model_blocks
 from variations.model_variations import model_variation_dictionary
 
 def parse_args():
     parser = argparse.ArgumentParser(description="Inference from trained models")
-    parser.add_argument("--device", type=str, required=True, help="Device to run inference (e.g., 'cpu', 'cuda', 'cuda:0', 'cuda:1')")
-    parser.add_argument("--out_dir", type=str, required=True, help="Directory to load checkpoint from")
+    parser.add_argument("--device", type=str, default="cuda", help="Device to run inference (e.g., 'cpu', 'cuda', 'cuda:0', 'cuda:1')")
+    parser.add_argument("--out_dir", type=str, default="out", help="Directory to load checkpoint from")
     parser.add_argument("--quantization_data_file", type=str, default=None, help="File name to export the quantized weights/activations, scale factor, and zero point")
     parser.add_argument("--init_from", type=str, default="resume", help="Either 'resume' (from an out_dir) or a GPT-2 variant (e.g., 'gpt2-xl')")
     parser.add_argument("--start", type=str, default="\n", help="Start text for generation. Can specify a file using 'FILE:prompt.txt'")
@@ -41,6 +42,7 @@ def parse_args():
     parser.add_argument('--sym_rot_num_angles', type=int, default=None, help="Number of angles for symmetrical rotary embedding")
     parser.add_argument('--rope_length', type=int, default=None, help="Number of embeddings to rotate (must be an even number <= total embedding size)")
     parser.add_argument('--token_boundary', type=str, default=None, help="optional separator between emitted tokens")
+    parser.add_argument('--print_model_info', default=True, action=argparse.BooleanOptionalAction, help="print info about model before inference")
 
     return parser.parse_args()

@@ -92,16 +94,16 @@ def save_args(args, out_dir):
         json.dump(vars(args), f, indent=4)
 
 
+#TODO: Rename to reflect general purpose
 def save_quantized_data(state_dict, out_file):
     to_save = OrderedDict()
     for k, v in list(state_dict.items()):
-        if "mlp_act" in k or "attn_act" in k or k.endswith("quantized_bias") or k.endswith("bias_norm") or k.endswith("zero_point") or k.endswith("quantized_weight") or k.endswith("weight_norm"):
-            to_save[k] = v.cpu().numpy()
+        # if "mlp_act" in k or "attn_act" in k or k.endswith("quantized_bias") or k.endswith("bias_norm") or k.endswith("zero_point") or k.endswith("quantized_weight") or k.endswith("weight_norm"):
+        to_save[k] = v.cpu().numpy()
 
     with open(f"{out_file}.pkl", 'wb') as f:
         pickle.dump(to_save, f)
 
 def main():
     args = parse_args()
 
@@ -135,10 +137,22 @@ def main():
 
         model.load_state_dict(state_dict, strict=False)
     else:
-        model = GPT.from_pretrained(args.init_from, dict(dropout=0.0))
+        # need to create a completely "default" GPTConfig and overwrite using model_variations
+        gptconf = GPTConfig()
+        variation_dict = model_variation_dictionary[args.init_from]
+        for k in variation_dict:
+            gptconf[k] = variation_dict[k]
+        model = GPT.from_pretrained(gptconf, model_type=args.init_from)
 
     model.eval()
     model.to(args.device)
 
+    # Print the model summary
+    if args.print_model_info:
+        print_summary(model)
+        print_model_blocks(model)
+        print_module_structure(model)
+
     if args.compile:
         model = torch.compile(model)
 
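
The new resume path assumes GPTConfig supports item assignment (gptconf[k] = ...; a plain dataclass would need a __setitem__ helper), and that model_variation_dictionary maps each GPT-2 variant name to its preset fields. A hypothetical sketch of that dictionary's shape, inferred from the config_args table this commit removes from model.py (the real one lives in variations/model_variations.py):

model_variation_dictionary = {
    'gpt2':        dict(n_layer=12, n_head=12, n_embd=768,  vocab_size=50257, block_size=1024, bias=True),
    'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024, vocab_size=50257, block_size=1024, bias=True),
    'gpt2-large':  dict(n_layer=36, n_head=20, n_embd=1280, vocab_size=50257, block_size=1024, bias=True),
    'gpt2-xl':     dict(n_layer=48, n_head=25, n_embd=1600, vocab_size=50257, block_size=1024, bias=True),
}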
