Enable code formatting (#40)
Adds a GitHub workflow so that it is called when commits are pushed.
Also, the changes from running the formatter locally are committed
so that the workflow runs as expected.

Signed-off-by: Martin Hickey <martin.hickey@ie.ibm.com>
hickeyma authored Feb 13, 2024
1 parent ac597d8 commit cef30ea
Showing 10 changed files with 286 additions and 116 deletions.
38 changes: 38 additions & 0 deletions .github/workflows/format.yml
@@ -0,0 +1,38 @@
# Copyright The Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Format

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install -r setup_requirements.txt
      - name: Check Formatting
        run: tox -e fmt
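Note for contributors: the same check can be reproduced locally before pushing. Based on the workflow steps above, and assuming tox is provided via setup_requirements.txt, running the two commands below should match what CI does:

python -m pip install -r setup_requirements.txt
tox -e fmt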

54 changes: 40 additions & 14 deletions scripts/run_inference.py
@@ -8,13 +8,16 @@
If these things change in the future, we should consider breaking it up.
"""
# Standard
import argparse
import json
import os

# Third Party
from peft import AutoPeftModelForCausalLM
import torch
from tqdm import tqdm
from transformers import AutoTokenizer
import torch


### Utilities
@@ -30,10 +33,13 @@ class AdapterConfigPatcher:
# When loaded in this block, the config's base_model_name_or_path is "foo"
peft_model = AutoPeftModelForCausalLM.from_pretrained(checkpoint_path)
"""

def __init__(self, checkpoint_path: str, overrides: dict):
self.checkpoint_path = checkpoint_path
self.overrides = overrides
self.config_path = AdapterConfigPatcher._locate_adapter_config(self.checkpoint_path)
self.config_path = AdapterConfigPatcher._locate_adapter_config(
self.checkpoint_path
)
# Values that we will patch later on
self.patched_values = {}

@@ -58,7 +64,7 @@ def _locate_adapter_config(checkpoint_path: str) -> str:
def _apply_config_changes(self, overrides: dict) -> dict:
"""Applies a patch to a config with some override dict, returning the values
that we patched over so that they may be restored later.
Args:
overrides: dict
Overrides to write into the adapter_config.json. Currently, we
@@ -99,7 +105,9 @@ def _get_old_config_values(adapter_config: dict, overrides: dict) -> dict:
# For now, we only expect to patch the base model; this may change in the future,
# but ensure that anything we are patching is defined in the original config
if not set(overrides.keys()).issubset(set(adapter_config.keys())):
raise KeyError("Adapter config overrides must be set in the config being patched")
raise KeyError(
"Adapter config overrides must be set in the config being patched"
)
return {key: adapter_config[key] for key in overrides}

def __enter__(self):
@@ -119,7 +127,9 @@ def __init__(self, model, tokenizer, device):
self.device = device

@classmethod
def load(cls, checkpoint_path: str, base_model_name_or_path: str=None) -> "TunedCausalLM":
def load(
cls, checkpoint_path: str, base_model_name_or_path: str = None
) -> "TunedCausalLM":
"""Loads an instance of this model.
Args:
@@ -138,7 +148,11 @@ def load(cls, checkpoint_path: str, base_model_name_or_path: str=None) -> "Tuned
TunedCausalLM
An instance of this class on which we can run inference.
"""
overrides = {"base_model_name_or_path": base_model_name_or_path} if base_model_name_or_path is not None else {}
overrides = (
{"base_model_name_or_path": base_model_name_or_path}
if base_model_name_or_path is not None
else {}
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
# Apply the configs to the adapter config of this model; if no overrides
# are provided, then the context manager doesn't have any effect.
@@ -153,7 +167,6 @@ def load(cls, checkpoint_path: str, base_model_name_or_path: str=None) -> "Tuned
peft_model.to(device)
return cls(peft_model, tokenizer, device)


def run(self, text: str, *, max_new_tokens: int) -> str:
"""Runs inference on an instance of this model.
@@ -165,13 +178,17 @@ def run(self, text: str, *, max_new_tokens: int) -> str:
Returns:
str
Text generation result.
Text generation result.
"""
tok_res = self.tokenizer(text, return_tensors="pt")
input_ids = tok_res.input_ids.to(self.device)

peft_outputs = self.peft_model.generate(input_ids=input_ids, max_new_tokens=max_new_tokens)
decoded_result = self.tokenizer.batch_decode(peft_outputs, skip_special_tokens=False)[0]
peft_outputs = self.peft_model.generate(
input_ids=input_ids, max_new_tokens=max_new_tokens
)
decoded_result = self.tokenizer.batch_decode(
peft_outputs, skip_special_tokens=False
)[0]
return decoded_result


@@ -180,7 +197,9 @@ def main():
parser = argparse.ArgumentParser(
description="Loads a tuned model and runs an inference call(s) through it"
)
parser.add_argument("--model", help="Path to tuned model to be loaded", required=True)
parser.add_argument(
"--model", help="Path to tuned model to be loaded", required=True
)
parser.add_argument(
"--out_file",
help="JSON file to write results to",
@@ -189,7 +208,7 @@ def main():
parser.add_argument(
"--base_model_name_or_path",
help="Override for base model to be used [default: value in model adapter_config.json]",
default=None
default=None,
)
parser.add_argument(
"--max_new_tokens",
@@ -199,7 +218,10 @@ def main():
)
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("--text", help="Text to run inference on")
group.add_argument("--text_file", help="File to be processed where each line is a text to run inference on")
group.add_argument(
"--text_file",
help="File to be processed where each line is a text to run inference on",
)
args = parser.parse_args()
# If we passed a file, check if it exists before doing anything else
if args.text_file and not os.path.isfile(args.text_file):
@@ -220,7 +242,10 @@ def main():

# TODO: we should add batch inference support
results = [
{"input": text, "output": loaded_model.run(text, max_new_tokens=args.max_new_tokens)}
{
"input": text,
"output": loaded_model.run(text, max_new_tokens=args.max_new_tokens),
}
for text in tqdm(texts)
]

@@ -230,5 +255,6 @@ def main():

print(f"Exported results to: {args.out_file}")


if __name__ == "__main__":
main()
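For context, the reformatting above does not change the script's command-line interface. An illustrative invocation using the flags defined in main() could look like the following (the checkpoint path and prompt are placeholders, not values from this commit):

python scripts/run_inference.py \
    --model <path-to-tuned-checkpoint> \
    --text "<text to run inference on>" \
    --max_new_tokens 100 \
    --out_file inference_results.json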
7 changes: 2 additions & 5 deletions setup.py
@@ -1,7 +1,4 @@
# Third Party
from setuptools import find_packages, setup

setup(
name="tuning",
version="0.0.1",
packages=find_packages()
)
setup(name="tuning", version="0.0.1", packages=find_packages())
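Since the reformatted setup.py is functionally identical, installing the package for local development still follows the standard setuptools flow, for example an editable install from the repository root:

python -m pip install -e .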
14 changes: 10 additions & 4 deletions tuning/aim_loader.py
@@ -1,16 +1,22 @@
# Standard
import os

# Third Party
from aim.hugging_face import AimCallback


def get_aimstack_callback():
# Initialize a new run
aim_server = os.environ.get('AIMSTACK_SERVER')
aim_db = os.environ.get('AIMSTACK_DB')
aim_experiment = os.environ.get('AIMSTACK_EXPERIMENT')
aim_server = os.environ.get("AIMSTACK_SERVER")
aim_db = os.environ.get("AIMSTACK_DB")
aim_experiment = os.environ.get("AIMSTACK_EXPERIMENT")
if aim_experiment is None:
aim_experiment = ""

if aim_server:
aim_callback = AimCallback(repo='aim://'+aim_server+'/', experiment=aim_experiment)
aim_callback = AimCallback(
repo="aim://" + aim_server + "/", experiment=aim_experiment
)
if aim_db:
aim_callback = AimCallback(repo=aim_db, experiment=aim_experiment)
else:
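As the reformatted function shows, the Aim callback is configured entirely through environment variables. A hypothetical setup for logging to a remote Aim server (host and experiment name are placeholders) would be:

export AIMSTACK_SERVER=<aim-server-host>
export AIMSTACK_EXPERIMENT=<experiment-name>

with AIMSTACK_DB available as an alternative that points the callback at a repo or database path.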
34 changes: 25 additions & 9 deletions tuning/config/configs.py
@@ -1,32 +1,46 @@
# Standard
from dataclasses import dataclass, field
from typing import Dict, Optional, Union

# Third Party
import torch
import transformers

DEFAULT_CONTEXT_LENGTH=4096
DEFAULT_OPTIMIZER="adamw_torch"
DEFAULT_CONTEXT_LENGTH = 4096
DEFAULT_OPTIMIZER = "adamw_torch"

IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "<PAD>"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"


@dataclass
class ModelArguments:
model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
use_flash_attn: bool = field(
default=True,
metadata={"help": "Use Flash attention v2 from transformers, default is True"}
metadata={"help": "Use Flash attention v2 from transformers, default is True"},
)
torch_dtype: Optional[Union[torch.dtype , str]] = torch.bfloat16
torch_dtype: Optional[Union[torch.dtype, str]] = torch.bfloat16


@dataclass
class DataArguments:
data_path: str = field(default=None, metadata={"help": "Path to the training data in JSONL format."})
response_template: str = field(default=None, metadata={"help": "Response template, separator to train on completions only"})
dataset_text_field: str = field(default=None, metadata={"help": "Training dataset text field"})
validation_data_path: str = field(default=None, metadata={"help": "Path to the validation data in JSONL format."})
data_path: str = field(
default=None, metadata={"help": "Path to the training data in JSONL format."}
)
response_template: str = field(
default=None,
metadata={"help": "Response template, separator to train on completions only"},
)
dataset_text_field: str = field(
default=None, metadata={"help": "Training dataset text field"}
)
validation_data_path: str = field(
default=None, metadata={"help": "Path to the validation data in JSONL format."}
)


@dataclass
@@ -35,7 +49,9 @@ class TrainingArguments(transformers.TrainingArguments):
# optim: str = field(default=DEFAULT_OPTIMIZER)
model_max_length: int = field(
default=DEFAULT_CONTEXT_LENGTH,
metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
metadata={
"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
},
)
packing: bool = field(
default=False,
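These dataclasses follow the usual transformers argument-dataclass pattern. A minimal sketch of how they could be parsed together is shown below; the HfArgumentParser call is an assumption about typical usage, not something added by this commit:

# Sketch only: assumes the standard transformers HfArgumentParser pattern.
import transformers

from tuning.config import configs

parser = transformers.HfArgumentParser(
    (configs.ModelArguments, configs.DataArguments, configs.TrainingArguments)
)
model_args, data_args, training_args = parser.parse_args_into_dataclasses()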
17 changes: 11 additions & 6 deletions tuning/config/peft_config.py
@@ -1,15 +1,20 @@
# Standard
from dataclasses import dataclass, field
from typing import List


@dataclass
class LoraConfig:
r: int = 8
lora_alpha: int = 32
target_modules: List[str] = field(default_factory=lambda: ["q_proj", "v_proj"], metadata={
"help": "The names of the modules to apply LORA to. LORA selects modules which either completely match or "
"end with one of the strings. If the value is [\"all-linear\"], then LORA selects all linear and Conv1D "
"modules except for the output layer."
})
target_modules: List[str] = field(
default_factory=lambda: ["q_proj", "v_proj"],
metadata={
"help": "The names of the modules to apply LORA to. LORA selects modules which either completely match or "
'end with one of the strings. If the value is ["all-linear"], then LORA selects all linear and Conv1D '
"modules except for the output layer."
},
)
bias = "none"
lora_dropout: float = 0.05

@@ -19,4 +24,4 @@ class PromptTuningConfig:
prompt_tuning_init: str = "TEXT"
num_virtual_tokens: int = 8
prompt_tuning_init_text: str = "Classify if the tweet is a complaint or not:"
tokenizer_name_or_path: str = "llama-7b-hf"
tokenizer_name_or_path: str = "llama-7b-hf"
23 changes: 16 additions & 7 deletions tuning/data/tokenizer_data_utils.py
@@ -1,20 +1,25 @@
import transformers
# Standard
from typing import Dict, Sequence
import copy
from tuning.config import configs
import json
import logging

# Third Party
from torch.utils.data import Dataset
import logging
import json
import torch
import transformers

# Local
from tuning.config import configs


def tokenizer_and_embedding_resize(
special_tokens_dict: Dict,
tokenizer: transformers.PreTrainedTokenizer,
model: transformers.PreTrainedModel,
):
"""Resize tokenizer and embedding.
TODO: In the future, make sure we can have vocab size divisible by 64.
"""
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
@@ -24,8 +29,12 @@ def tokenizer_and_embedding_resize(
input_embeddings = model.get_input_embeddings().weight.data
output_embeddings = model.get_output_embeddings().weight.data

input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
dim=0, keepdim=True
)
output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
dim=0, keepdim=True
)

input_embeddings[-num_new_tokens:] = input_embeddings_avg
output_embeddings[-num_new_tokens:] = output_embeddings_avg
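The resize above initializes any newly added token embeddings to the mean of the pre-existing embeddings. A minimal usage sketch, assuming a Hugging Face causal LM and the DEFAULT_PAD_TOKEN from tuning/config/configs.py (the model name is simply the ModelArguments default, not prescribed by this commit):

# Sketch only: illustrates one way to call tokenizer_and_embedding_resize.
from transformers import AutoModelForCausalLM, AutoTokenizer

from tuning.config import configs
from tuning.data.tokenizer_data_utils import tokenizer_and_embedding_resize

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
if tokenizer.pad_token is None:
    tokenizer_and_embedding_resize(
        special_tokens_dict={"pad_token": configs.DEFAULT_PAD_TOKEN},
        tokenizer=tokenizer,
        model=model,
    )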
(Diffs for the remaining three changed files are not shown.)
