
Merge branch 'master' into add_training_estimates
klei22 authored Aug 24, 2024
2 parents 7487449 + a65c5c3 commit 858bc2f
Showing 19 changed files with 670 additions and 176 deletions.
28 changes: 28 additions & 0 deletions explorations/consmax_v2_sweep.json
@@ -0,0 +1,28 @@
[
    {
        "max_iters": ["3500"],
        "n_layer": ["12"],
        "n_head": ["6"],
        "n_embd": ["384"],
        "block_size":["256"],
        "device": ["cuda"],
        "dtype": ["bfloat16"],
        "dataset": ["shakespeare_char"],
        "use_gradient_checkpointing": [true],
        "use_rotary_embeddings": [false],
        "use_abs_pos_embeddings": [true],
        "compile": [false],
        "softmax_variant_attn": ["consmax_v2"],
        "consmax_initial_beta": ["2.5"],
        "consmax_initial_gamma": ["100.0"],
        "softmax_io_logging": [true],
        "create_statistics": [true],
        "plot_statistics": [true],
        "consmax_per_head": {
            "conditions": [["softmax_variant_attn", "consmax_v2"]],
            "options": [true, false]
        },
        "use_post_ln": [true, false]
    }
]
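
The nested "consmax_per_head" entry is a conditional sweep parameter: its "options" only fan out when every ["key", "value"] pair in "conditions" matches the current combination. A minimal sketch of that expansion logic, assuming run_experiments.py interprets the config this way (hypothetical helper, not the repository's actual implementation):

import itertools

def expand(config):
    # Plain parameters are lists of candidate values; conditional parameters
    # are dicts carrying "conditions" and "options".
    plain = {k: v for k, v in config.items() if isinstance(v, list)}
    conditional = {k: v for k, v in config.items() if isinstance(v, dict)}
    for combo in itertools.product(*plain.values()):
        base = dict(zip(plain.keys(), combo))
        # A conditional parameter only fans out when all its conditions hold.
        gated = {k: v["options"] for k, v in conditional.items()
                 if all(base.get(ck) == cv for ck, cv in v["conditions"])}
        # product() over zero gated params yields one empty tuple, i.e. just base.
        for extra in itertools.product(*gated.values()):
            yield {**base, **dict(zip(gated.keys(), extra))}

Under this reading, the config above expands to four runs: use_post_ln in {true, false} crossed with consmax_per_head in {true, false} (the latter active because softmax_variant_attn is "consmax_v2").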

26 changes: 13 additions & 13 deletions explorations/strongermax_sweep.json
@@ -1,27 +1,27 @@
 [
     {
-        "max_iters": ["6000"],
+        "max_iters": ["3500"],
         "n_layer": ["6"],
         "n_kv_group": ["6"],
         "n_head": ["6"],
         "n_embd": ["384"],
         "block_size":["256"],
         "device": ["cuda"],
-        "dtype": ["float16"],
+        "dtype": ["bfloat16"],
         "dataset": ["shakespeare_char"],
         "use_rotary_embeddings": [false],
         "use_abs_pos_embeddings": [true],
-        "compile": [true],
+        "compile": [false],
         "softmax_variant_attn": ["strongermax"],
-        "strongermax_strength": ["1.5", "2", "2.719", "3", "4", "5"],
-        "strongermax_divisor": ["1.0", "10.0", "100.0", "1000.0"],
-        "use_post_ln": [true, false],
-        "strongermax_use_xmax" : [true, false],
-        "strongermax_sum_to_1": [true, false],
-        "statistic": ["all_stats"],
-        "graph_type":["all"],
-        "box_plot_interval": ["1000"],
-        "box_plot_statistic": ["all"],
-        "patience": ["1000"]
+        "strongermax_strength": ["2.719"],
+        "strongermax_divisor": ["256"],
+        "strongermax_use_xmax" : [true],
+        "strongermax_xmax_guess" : ["-50"],
+        "strongermax_overflow_recompute" : [true],
+        "softmax_io_logging" : [false],
+        "create_statistics" : [false],
+        "plot_statistics" : [false],
+        "statistic" : ["input_max"],
+        "strongermax_sum_to_1": [false]
     }
 ]
36 changes: 23 additions & 13 deletions gpt_conf.py
@@ -18,6 +18,11 @@ class GPTConfig:
     moe_top_k: int = 2
     moe_router_scheme: str = "softmax"
 
+    # Logging options
+    softmax_io_logging: bool = False
+    consmax_beta_gamma_logging: bool = False
+    plot_statistics: bool = False
+
     # Training options
     ## Gradient Checkpointing - More memory efficient (can do long contexts), but is slower
     use_gradient_checkpointing: bool = False
@@ -49,10 +54,13 @@ class GPTConfig:
     ## ConSmax Options
     consmax_initial_beta: float = 2.0 # beta adjustment
     consmax_initial_gamma: float = 100.0 # denominator adjustment
-    consmax_use_euler_base: bool = True # use 'e' as base for ConSmax, default
     consmax_base: float = 2.0 # base to utilize for ConSmax
+    consmax_use_euler_base: bool = True # use 'e' as base for ConSmax, default
 
-    ## SaturatingConSmax special options (otherwise same as ConSmax)
+    ## ConSmaxV2 Special Options
+    consmax_per_head: bool = True # different beta gamma per head
+
+    ## SaturatingConSmax Special options (otherwise same as ConSmax)
     consmax_saturation: float = 11.0 # for SaturatingConSmax saturation point
     consmax_learnable_beta: bool = True
     consmax_learnable_gamma: bool = True
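
For context on the beta/gamma/base fields: ConSmax (per the original ConSmax paper) replaces softmax's max-subtraction and sum-normalization with learnable constants, and ConSmaxV2's consmax_per_head gives each attention head its own beta/gamma pair. A minimal sketch of the standard formulation, not this repository's actual model.py code:

import torch

def consmax(x, beta=2.0, gamma=100.0, use_euler_base=True, base=2.0):
    # beta shifts the logits (standing in for the usual max-subtraction);
    # gamma rescales the result (standing in for the softmax denominator).
    shifted = x - beta
    out = torch.exp(shifted) if use_euler_base else base ** shifted
    return out / gamma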
@@ -72,9 +80,11 @@ class GPTConfig:
 
     ## Strongermax options
     strongermax_strength: float = 2.0 # Softermax with option of 'stronger' (larger integer) bases
-    strongermax_sum_to_1: bool = False # Softermax with option of 'stronger' (larger integer) bases
-    strongermax_divisor: float = 1.0 # Softermax with option of 'stronger' (larger integer) bases
-    strongermax_use_xmax: bool = True # Softermax with option of 'stronger' (larger integer) bases
+    strongermax_sum_to_1: bool = False
+    strongermax_divisor: float = 1.0
+    strongermax_use_xmax: bool = True
+    strongermax_xmax_guess: float = 1.0
+    strongermax_overflow_recompute: bool = False
 
     ## ExpPolymax options
     exppolymax_use_euler_base: bool = True
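
The Strongermax fields read as a parameterized softmax. The sketch below is inferred purely from the option names above (hypothetical; the authoritative implementation is in model.py):

import torch

def strongermax(x, strength=2.0, divisor=1.0, use_xmax=True,
                xmax_guess=None, overflow_recompute=False, sum_to_1=False):
    if use_xmax:
        # Subtracting a maximum keeps strength**x from overflowing; a fixed
        # xmax_guess avoids a full reduction over the logits, and
        # overflow_recompute presumably falls back to the true max whenever
        # the guess is exceeded.
        if xmax_guess is not None:
            shift = xmax_guess
            if overflow_recompute and (x > xmax_guess).any():
                shift = x.max(dim=-1, keepdim=True).values
        else:
            shift = x.max(dim=-1, keepdim=True).values
        x = x - shift
    out = (strength ** x) / divisor
    if sum_to_1:
        out = out / out.sum(dim=-1, keepdim=True)
    return out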
@@ -137,7 +147,7 @@ class GPTConfig:
     linear_std_init: float= 0.02
 
     # Quantizations
-
+    ## Embedding Quantizations
     quantize_wte: bool = False
     quantize_wpe: bool = False
@@ -200,13 +210,13 @@ def from_json(cls, filename: str):
         try:
             with open(filename, 'r') as json_file:
                 config_dict = json.load(json_file)
 
             # Get all field names of the dataclass
             field_names = {f.name for f in fields(cls)}
 
             # Filter the loaded dict to only include valid fields
             filtered_dict = {k: v for k, v in config_dict.items() if k in field_names}
 
             # Create and return a new instance
             return cls(**filtered_dict)
         except FileNotFoundError:
@@ -218,14 +228,14 @@ def from_json(cls, filename: str):
         except TypeError as e:
             print(f"Error: Invalid data in JSON file. {str(e)}")
             return None
 
     def to_json(self, filename: str):
         """
         Function to save a GPTConfig object as json to be used for later model creation
-        input:
+        input:
         - fout: string = filename of saved config file
         """
         conf_dict = asdict(self)
 
8 changes: 8 additions & 0 deletions huggingface_model/gpt_sample.py
@@ -0,0 +1,8 @@
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained("gpt2-custom")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-custom")

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
output = generator("Once upon a time", max_length=50, num_return_sequences=1)
print(output)
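
Here "gpt2-custom" is the local directory that huggingface_model/gpt_train.py (below) saves the model and tokenizer into, so this sample script assumes training has already been run.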
13 changes: 2 additions & 11 deletions huggingface_model/gpt_train.py
@@ -1,8 +1,6 @@
-from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer
+from transformers import Trainer, TrainingArguments
 from datasets import load_dataset
 import gpt_model
-from transformers import pipeline
-from torch.utils.data import DataLoader
 
 # Load the Wikitext dataset
 dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
@@ -51,11 +49,4 @@ def tokenize_function(examples):
 trainer.train()
 
 gpt_model.pretrained_model.save_pretrained("gpt2-custom")
-gpt_model.tokenizer.save_pretrained("gpt2-custom")
-
-# model = AutoModelForCausalLM.from_pretrained("gpt2-custom")
-# tokenizer = AutoTokenizer.from_pretrained("gpt2-custom")
-
-generator = pipeline("text-generation", model=gpt_model.pretrained_model, tokenizer=gpt_model.tokenizer)
-output = generator("Once upon a time", max_length=50, num_return_sequences=1)
-print(output)
+gpt_model.tokenizer.save_pretrained("gpt2-custom")
8 changes: 8 additions & 0 deletions huggingface_model/upload.py
@@ -0,0 +1,8 @@
from transformers import AutoTokenizer, AutoConfig, AutoModel

config = AutoConfig.from_pretrained("gpt2-custom")
config.push_to_hub("custom_gpt2")
model = AutoModel.from_pretrained("gpt2-custom")
model.push_to_hub("custom_gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2-custom")
tokenizer.push_to_hub("custom_gpt2")
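
Note that push_to_hub assumes an already-authenticated Hugging Face session (e.g., via huggingface-cli login) and creates or reuses a Hub repo named custom_gpt2 under that account.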
4 changes: 2 additions & 2 deletions legacy_model.py
@@ -120,7 +120,7 @@ def forward(self, x):
         return x
 
 @dataclass
-class kGPTConfig:
+class GPTConfig:
     block_size: int = 1024
     vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
     n_layer: int = 12
@@ -129,7 +129,7 @@ class kGPTConfig:
     dropout: float = 0.0
     bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
 
-class kGPT(nn.Module):
+class GPT(nn.Module):
 
     def __init__(self, config):
         super().__init__()
35 changes: 5 additions & 30 deletions model.py
@@ -643,38 +643,13 @@ def crop_block_size(self, block_size):
             block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]
 
     @classmethod
-    def from_pretrained(cls, model_type, override_args=None):
+    def from_pretrained(cls, config, model_type):
         # assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
-        override_args = override_args or {} # default to empty dict
-        # only dropout can be overridden see more notes below
-        assert all(k == 'dropout' for k in override_args)
-        from transformers import GPT2LMHeadModel, AutoModelForCausalLM
+        from transformers import GPT2LMHeadModel
 
         print(f"loading weights from pretrained gpt: {model_type}")
 
-        # n_layer, n_head and n_embd are determined from model_type
-        config_args = {
-            'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
-            'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
-            'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
-            'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
-        }[model_type]
-        print("forcing vocab_size=50257, block_size=1024, bias=True")
-        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
-        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
-        config_args['bias'] = True # always True for GPT model checkpoints
-        config_args['window_size'] = 128 # always None for GPT model checkpoints
-        # we can override the dropout rate, if desired
-        if 'dropout' in override_args:
-            print(f"overriding dropout rate to {override_args['dropout']}")
-            config_args['dropout'] = override_args['dropout']
-
-        # create a from-scratch initialized minGPT model
-        # TODO: pass more cmd line flags like "softmax" variant into this from_pretrained
-        config = GPTConfig(**config_args)
-
-        # overriding our custom GPTConf presets of "rmsnorm" to "layernorm" for compatibility
-        config.norm_variant_attn = "layernorm"
-        config.norm_variant_output = "layernorm"
-
         model = GPT(config)
         model_hf = GPT2LMHeadModel.from_pretrained(model_type)
 
@@ -703,7 +678,7 @@ def from_pretrained(cls, model_type, override_args=None):
                 sd[key].copy_(sd_hf[key].t())
             elif key.endswith('attn.c_attn.weight') or key.endswith('attn.c_attn.bias'):
                 # split into c_attn_q/k/v
-                q, k, v = sd_hf[key].split(config.n_embd, dim=-1)
+                q, k, v = sd_hf[key].t().split(config.n_embd, dim=0)
                 q_key_str = key.replace("c_attn", "c_attn_q")
                 k_key_str = key.replace("c_attn", "c_attn_k")
                 v_key_str = key.replace("c_attn", "c_attn_v")
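
The revised split transposes first because Hugging Face's GPT-2 checkpoint stores c_attn as a Conv1D module whose weight is laid out (in_features, out_features), the transpose of nn.Linear's (out_features, in_features). A shape sketch with illustrative values for 'gpt2' (n_embd=768):

import torch

n_embd = 768
w_hf = torch.randn(n_embd, 3 * n_embd)  # HF Conv1D layout: (in, out)

# Transpose to Linear layout first, then split along dim=0 so each chunk
# is an (n_embd, n_embd) weight ready for the separate c_attn_q/k/v Linears.
q, k, v = w_hf.t().split(n_embd, dim=0)

# For attn.c_attn.bias (1-D, shape (3*n_embd,)) the same line still works:
# .t() is a no-op on 1-D tensors, and dim=0 equals the old dim=-1 split.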
38 changes: 38 additions & 0 deletions model_info_util/model_info.py
@@ -0,0 +1,38 @@
import torch
from torchinfo import summary
from rich import print
from rich.console import Console
from rich.text import Text
import io

console = Console()

def print_summary(model):
    block_header = Text(f"High Level Parameters:", style="bold underline purple")
    console.print(block_header)
    summary(model)

def print_model_blocks(model, block_range=1):
    for idx, block in enumerate(model.transformer.h):
        block_header = Text(f"Summary for Block {idx + 1}:", style="bold underline green")
        console.print(block_header)
        summary(block)
        if (idx + 1) == block_range:
            break

def print_module_structure(module):
    console.print("-" * 50, style="dim")
    for name, submodule in module.named_children():
        console.print(f'{name}: {submodule}', style="yellow")
    console.print("-" * 50, style="dim")

def print_model_tree(model, indent="", print_params=False):
    for name, module in model.named_children():
        print(indent + name + ": " + str(module.__class__.__name__))
        if isinstance(module, torch.nn.Module):
            # Print parameters for the next level only
            if print_params:
                for param_name, _ in module.named_parameters():
                    print(indent + "  " + param_name)
            else: # Recursively print submodules without parameters
                print_model_tree(module, indent + "  ")
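
A usage sketch for these helpers (assumes a GPT instance built from this repository's model.py, with torchinfo and rich installed):

from model import GPT, GPTConfig
from model_info_util.model_info import (print_summary, print_model_blocks,
                                        print_module_structure, print_model_tree)

model = GPT(GPTConfig())
print_summary(model)                          # torchinfo high-level parameter summary
print_model_blocks(model, block_range=2)      # summaries for the first two transformer blocks
print_module_structure(model.transformer.h[0])  # children of one block
print_model_tree(model, print_params=True)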
4 changes: 2 additions & 2 deletions run_experiments.py
@@ -12,8 +12,8 @@
 
 def parse_args():
     parser = argparse.ArgumentParser(description="Run experiments based on a json configuration file.")
-    parser.add_argument("--config", type=str, required=True, help="Path to the configuration JSON file.")
-    parser.add_argument("--output_dir", type=str, default="out", help="Directory to place the set of output checkpoints.")
+    parser.add_argument('-c', "--config", type=str, required=True, help="Path to the configuration JSON file.")
+    parser.add_argument('-o', "--output_dir", type=str, default="out", help="Directory to place the set of output checkpoints.")
     parser.add_argument("--csv_ckpt_dir", type=str, default="", help="Directory to place the set of csv checkpoints in csv_logs.")
     parser.add_argument("--prefix", type=str, default='', help="Optional prefix for tensorboard_run_name and out_dir.")
    parser.add_argument("--add_names", action="store_true", help="Include names of values of the configuration parameters in addition to values (may cause too long a file name).")
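
With these short flags, a sweep from this commit can be launched as, for example: python run_experiments.py -c explorations/consmax_v2_sweep.json -o out_consmax_v2 (the output directory name here is illustrative).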
28 changes: 21 additions & 7 deletions sample.py
@@ -15,12 +15,13 @@
 from collections import OrderedDict
 
 from model import GPT, GPTConfig
 
+from model_info_util.model_info import print_summary, print_module_structure, print_model_blocks
 from variations.model_variations import model_variation_dictionary
 
 def parse_args():
     parser = argparse.ArgumentParser(description="Inference from trained models")
-    parser.add_argument("--device", type=str, required=True, help="Device to run inference (e.g., 'cpu', 'cuda', 'cuda:0', 'cuda:1')")
-    parser.add_argument("--out_dir", type=str, required=True, help="Directory to load checkpoint from")
+    parser.add_argument("--device", type=str, default="cuda", help="Device to run inference (e.g., 'cpu', 'cuda', 'cuda:0', 'cuda:1')")
+    parser.add_argument("--out_dir", type=str, default="out", help="Directory to load checkpoint from")
     parser.add_argument("--quantization_data_file", type=str, default=None, help="File name to export the quantized weights/activations, scale factor, and zero point")
     parser.add_argument("--init_from", type=str, default="resume", help="Either 'resume' (from an out_dir) or a GPT-2 variant (e.g., 'gpt2-xl')")
     parser.add_argument("--start", type=str, default="\n", help="Start text for generation. Can specify a file using 'FILE:prompt.txt'")
@@ -41,6 +42,7 @@ def parse_args():
     parser.add_argument('--sym_rot_num_angles', type=int, default=None, help="Number of angles for symmetrical rotary embedding")
     parser.add_argument('--rope_length', type=int, default=None, help="Number of embeddings to rotate (must be an even number <= total embedding size)")
     parser.add_argument('--token_boundary', type=str, default=None, help="optional separator between emitted tokens")
+    parser.add_argument('--print_model_info', default=True, action=argparse.BooleanOptionalAction, help="print info about model before inference")
 
     return parser.parse_args()

@@ -92,16 +94,16 @@ def save_args(args, out_dir):
         json.dump(vars(args), f, indent=4)
 
 
+#TODO: Rename to reflect general purpose
 def save_quantized_data(state_dict, out_file):
     to_save = OrderedDict()
     for k, v in list(state_dict.items()):
-        if "mlp_act" in k or "attn_act" in k or k.endswith("quantized_bias") or k.endswith("bias_norm") or k.endswith("zero_point") or k.endswith("quantized_weight") or k.endswith("weight_norm"):
-            to_save[k] = v.cpu().numpy()
+        # if "mlp_act" in k or "attn_act" in k or k.endswith("quantized_bias") or k.endswith("bias_norm") or k.endswith("zero_point") or k.endswith("quantized_weight") or k.endswith("weight_norm"):
+        to_save[k] = v.cpu().numpy()
 
     with open(f"{out_file}.pkl", 'wb') as f:
         pickle.dump(to_save, f)
 
 def main():
     args = parse_args()
 
@@ -135,10 +137,22 @@ def main():
 
         model.load_state_dict(state_dict, strict=False)
     else:
-        model = GPT.from_pretrained(args.init_from, dict(dropout=0.0))
+        # need to create a completely "default" GPTConfig and overwrite using model_variations
+        gptconf = GPTConfig()
+        variation_dict = model_variation_dictionary[args.init_from]
+        for k in variation_dict:
+            gptconf[k] = variation_dict[k]
+        model = GPT.from_pretrained(gptconf, model_type=args.init_from)
 
     model.eval()
     model.to(args.device)
 
+    # Print the model summary
+    if args.print_model_info:
+        print_summary(model)
+        print_model_blocks(model)
+        print_module_structure(model)
+
     if args.compile:
         model = torch.compile(model)
 
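
The new resume path assumes GPTConfig supports item assignment (gptconf[k] = ...; a plain dataclass would need a __setitem__ helper), and that model_variation_dictionary maps each GPT-2 variant name to its preset fields. A hypothetical sketch of that dictionary's shape, inferred from the config_args table this commit removes from model.py (the real one lives in variations/model_variations.py):

model_variation_dictionary = {
    'gpt2':        dict(n_layer=12, n_head=12, n_embd=768,  vocab_size=50257, block_size=1024, bias=True),
    'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024, vocab_size=50257, block_size=1024, bias=True),
    'gpt2-large':  dict(n_layer=36, n_head=20, n_embd=1280, vocab_size=50257, block_size=1024, bias=True),
    'gpt2-xl':     dict(n_layer=48, n_head=25, n_embd=1600, vocab_size=50257, block_size=1024, bias=True),
}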
