
Commit dc65d8c: Merge branch 'main' into asonawane/mpt

apsonawane committed Aug 3, 2023
2 parents f116952 + e7ca300
Showing 14 changed files with 122 additions and 9 deletions.
8 changes: 5 additions & 3 deletions test.py
@@ -20,10 +20,10 @@
 # Some of the models have very heavyweight setup, so we have to set a very
 # generous limit. That said, we don't want the entire test suite to hang if
 # a single test encounters an extreme failure, so we give up after a test is
-# unresponsive to 5 minutes. (Note: this does not require that the entire
-# test case completes in 5 minutes. It requires that if the worker is
+# unresponsive for 5 minutes by default. (Note: this does not require that the
+# entire test case completes in 5 minutes. It requires that if the worker is
 # unresponsive for 5 minutes the parent will presume it dead / incapacitated.)
-TIMEOUT = 300  # Seconds
+TIMEOUT = int(os.getenv("TIMEOUT", 300))  # Seconds

 class TestBenchmark(unittest.TestCase):
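Reading the limit from the environment lets heavyweight models get more headroom without editing the file. A minimal sketch of the pattern, standard library only (the 900-second value is just an example):

```python
import os

# Unset -> int(300) == 300; set, e.g. `TIMEOUT=900 python test.py` -> int("900") == 900.
TIMEOUT = int(os.getenv("TIMEOUT", 300))  # Seconds
print(f"per-test timeout: {TIMEOUT}s")
```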

@@ -123,6 +123,8 @@ def _load_tests():
         devices.append('cuda')
     if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
         devices.append('mps')
+    if device := os.getenv('ACCELERATOR'):
+        devices.append(device)

     for path in _list_model_paths():
         # TODO: skipping quantized tests for now due to BC-breaking changes for prepare
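This makes the device matrix extensible from the environment: any accelerator string, not just the built-in cpu/cuda/mps probes, can be appended. A self-contained sketch ('xpu' is only an illustrative name):

```python
import os

devices = ['cpu']  # stand-in for the cuda/mps detection above
if device := os.getenv('ACCELERATOR'):
    devices.append(device)

# ACCELERATOR=xpu python test.py  ->  ['cpu', 'xpu']
print(devices)
```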
2 changes: 2 additions & 0 deletions torchbenchmark/__init__.py
@@ -295,6 +295,8 @@ def _maybe_import_model(package: str, model_path: str) -> Dict[str, Any]:
     diagnostic_msg = ""
     try:
         module = importlib.import_module(f'.models.{model_name}', package=package)
+        if accelerator_backend := os.getenv("ACCELERATOR_BACKEND"):
+            setattr(module, accelerator_backend, importlib.import_module(accelerator_backend))
         Model = getattr(module, 'Model', None)
         if Model is None:
             diagnostic_msg = f"Warning: {module} does not define attribute Model, skip it"
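The walrus-guarded `setattr` effectively runs `import <backend>` inside each freshly imported model module, so model code can reference the backend by its bare name. A sketch under the assumption that ACCELERATOR_BACKEND names an importable package (`json` stands in for a real backend here):

```python
import importlib
import os
import types

module = types.ModuleType("example_model")  # stand-in for a model module
os.environ["ACCELERATOR_BACKEND"] = "json"  # illustrative value only

if accelerator_backend := os.getenv("ACCELERATOR_BACKEND"):
    # Equivalent to `import json` executed in the module's namespace.
    setattr(module, accelerator_backend, importlib.import_module(accelerator_backend))

assert module.json.dumps({"ok": True}) == '{"ok": true}'
```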
15 changes: 15 additions & 0 deletions torchbenchmark/canary_models/llama_v2_13b/__init__.py
@@ -0,0 +1,15 @@
+from torchbenchmark.tasks import NLP
+from torchbenchmark.util.framework.huggingface.model_factory import HuggingFaceModel, HuggingFaceAuthMixin
+
+class Model(HuggingFaceModel, HuggingFaceAuthMixin):
+    task = NLP.LANGUAGE_MODELING
+    DEFAULT_TRAIN_BSIZE = 1
+    DEFAULT_EVAL_BSIZE = 1
+    DEEPCOPY = False
+
+    def __init__(self, test, device, batch_size=None, extra_args=[]):
+        HuggingFaceAuthMixin.__init__(self)
+        super().__init__(name="llama_v2_13b", test=test, device=device, batch_size=batch_size, extra_args=extra_args)
+
+    def train(self):
+        raise NotImplementedError("FSDP should implement a training loop")
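The 70b and 7b variants below differ only in the checkpoint name. A hypothetical usage sketch; it assumes the gated Llama 2 weights are accessible (e.g. after `huggingface-cli login`) and that a CUDA device is available:

```python
from torchbenchmark.canary_models.llama_v2_13b import Model

# test/device values are examples; the signature matches __init__ above.
m = Model(test="eval", device="cuda")
m.train()  # raises NotImplementedError until an FSDP training loop exists
```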
9 changes: 9 additions & 0 deletions torchbenchmark/canary_models/llama_v2_13b/install.py
@@ -0,0 +1,9 @@
+
+import subprocess
+import sys
+import os
+from torchbenchmark.util.framework.huggingface.patch_hf import patch_transformers, cache_model
+
+if __name__ == '__main__':
+    model_name = os.path.basename(os.path.dirname(os.path.abspath(__file__)))
+    cache_model(model_name)
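Because install.py derives the model name from its own directory, the identical nine-line script is reused for every canary model. The derivation in isolation (the path is hypothetical):

```python
import os

# With install.py at .../canary_models/llama_v2_13b/install.py, the model
# name is the containing directory.
path = "/repo/torchbenchmark/canary_models/llama_v2_13b/install.py"
model_name = os.path.basename(os.path.dirname(os.path.abspath(path)))
assert model_name == "llama_v2_13b"
```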
12 changes: 12 additions & 0 deletions torchbenchmark/canary_models/llama_v2_13b/metadata.yaml
@@ -0,0 +1,12 @@
+devices:
+  NVIDIA A100-SXM4-40GB:
+    eval_batch_size: 1
+eval_benchmark: false
+eval_deterministic: false
+eval_nograd: true
+not_implemented:
+- device: cpu
+- device: cuda
+  test: train
+train_benchmark: false
+train_deterministic: false
16 changes: 16 additions & 0 deletions torchbenchmark/canary_models/llama_v2_70b/__init__.py
@@ -0,0 +1,16 @@
+from torchbenchmark.tasks import NLP
+from torchbenchmark.util.framework.huggingface.model_factory import HuggingFaceModel, HuggingFaceAuthMixin
+
+class Model(HuggingFaceModel, HuggingFaceAuthMixin):
+    task = NLP.LANGUAGE_MODELING
+    DEFAULT_TRAIN_BSIZE = 1
+    DEFAULT_EVAL_BSIZE = 1
+    DEEPCOPY = False
+
+    def __init__(self, test, device, batch_size=None, extra_args=[]):
+        HuggingFaceAuthMixin.__init__(self)
+        super().__init__(name="llama_v2_70b", test=test, device=device, batch_size=batch_size, extra_args=extra_args)
+
+
+    def train(self):
+        raise NotImplementedError("FSDP should implement a training loop")
9 changes: 9 additions & 0 deletions torchbenchmark/canary_models/llama_v2_70b/install.py
@@ -0,0 +1,9 @@
+
+import subprocess
+import sys
+import os
+from torchbenchmark.util.framework.huggingface.patch_hf import patch_transformers, cache_model
+
+if __name__ == '__main__':
+    model_name = os.path.basename(os.path.dirname(os.path.abspath(__file__)))
+    cache_model(model_name)
12 changes: 12 additions & 0 deletions torchbenchmark/canary_models/llama_v2_70b/metadata.yaml
@@ -0,0 +1,12 @@
+devices:
+  NVIDIA A100-SXM4-40GB:
+    eval_batch_size: 1
+eval_benchmark: false
+eval_deterministic: false
+eval_nograd: true
+not_implemented:
+- device: cpu
+- device: cuda
+  test: train
+train_benchmark: false
+train_deterministic: false
16 changes: 16 additions & 0 deletions torchbenchmark/canary_models/llama_v2_7b/__init__.py
@@ -0,0 +1,16 @@
+from torchbenchmark.tasks import NLP
+from torchbenchmark.util.framework.huggingface.model_factory import HuggingFaceModel, HuggingFaceAuthMixin
+
+class Model(HuggingFaceModel, HuggingFaceAuthMixin):
+    task = NLP.LANGUAGE_MODELING
+    DEFAULT_TRAIN_BSIZE = 1
+    DEFAULT_EVAL_BSIZE = 1
+    DEEPCOPY = False
+
+    def __init__(self, test, device, batch_size=None, extra_args=[]):
+        HuggingFaceAuthMixin.__init__(self)
+        super().__init__(name="llama_v2_7b", test=test, device=device, batch_size=batch_size, extra_args=extra_args)
+
+
+    def train(self):
+        raise NotImplementedError("FSDP should implement a training loop")
9 changes: 9 additions & 0 deletions torchbenchmark/canary_models/llama_v2_7b/install.py
@@ -0,0 +1,9 @@
+
+import subprocess
+import sys
+import os
+from torchbenchmark.util.framework.huggingface.patch_hf import patch_transformers, cache_model
+
+if __name__ == '__main__':
+    model_name = os.path.basename(os.path.dirname(os.path.abspath(__file__)))
+    cache_model(model_name)
12 changes: 12 additions & 0 deletions torchbenchmark/canary_models/llama_v2_7b/metadata.yaml
@@ -0,0 +1,12 @@
+devices:
+  NVIDIA A100-SXM4-40GB:
+    eval_batch_size: 1
+eval_benchmark: false
+eval_deterministic: false
+eval_nograd: true
+not_implemented:
+- device: cpu
+- device: cuda
+  test: train
+train_benchmark: false
+train_deterministic: false
2 changes: 0 additions & 2 deletions torchbenchmark/models/hf_MPT_7b_instruct/metadata.yaml
@@ -4,7 +4,5 @@ devices:
 eval_benchmark: false
 eval_deterministic: false
 eval_nograd: true
-not_implemented:
-- jit: true
 train_benchmark: false
 train_deterministic: false
3 changes: 3 additions & 0 deletions torchbenchmark/util/framework/huggingface/model_factory.py
@@ -31,6 +31,9 @@
     # default num_hidden_layers=32 but that OOMs, feel free to change this config to something more real
     'llama_v2_7b_16h' : (512, 512, 'LlamaConfig(num_hidden_layers=16)', 'AutoModelForCausalLM'),
     'hf_MPT_7b_instruct': (512, 512, 'AutoConfig.from_pretrained("mosaicml/mpt-7b-instruct", trust_remote_code=True)', 'AutoModelForCausalLM'),
+    'llama_v2_7b' : (512, 512, 'AutoConfig.from_pretrained("meta-llama/Llama-2-7b-hf")', 'AutoModelForCausalLM'),
+    'llama_v2_13b' : (512, 512, 'AutoConfig.from_pretrained("meta-llama/Llama-2-13b-hf")', 'AutoModelForCausalLM'),
+    'llama_v2_70b' : (512, 512, 'AutoConfig.from_pretrained("meta-llama/Llama-2-70b-hf")', 'AutoModelForCausalLM'),
 }

 cpu_input_slice = {
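Each entry bundles two sequence-length settings with strings naming a config expression and a transformers class, to be evaluated when the benchmark builds the model. A hedged sketch of consuming such a tuple; the field meanings are inferred from the values above, and GPT-2 replaces the gated Llama 2 checkpoint so the snippet runs without authentication:

```python
import transformers
from transformers import AutoConfig, AutoModelForCausalLM  # noqa: F401 (referenced via eval)

entry = (512, 512, 'AutoConfig.from_pretrained("gpt2")', 'AutoModelForCausalLM')
max_length, max_new_tokens, config_expr, model_cls_name = entry

config = eval(config_expr)  # build the Hugging Face config object
model = getattr(transformers, model_cls_name).from_config(config)  # random init, no weight download
```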
6 changes: 2 additions & 4 deletions torchbenchmark/util/model.py
@@ -162,10 +162,8 @@ def determine_batch_size(self, batch_size=None):
             assert current_device_name, f"torch.cuda.get_device_name() returns None when device is set to cuda, please double check."
             if current_device_name in SPECIAL_DEVICE_MAPPING:
                 current_device_name = SPECIAL_DEVICE_MAPPING[current_device_name]
-        elif self.device == "cpu":
-            current_device_name = "cpu"
-        elif self.device == "mps":
-            current_device_name = "mps"
+        else:
+            current_device_name = str(self.device)
         # use the device suggestion on CUDA inference tests, key should be either eval_batch_size or train_batch_size
         device_batch_size_key = f"{self.test}_batch_size"
         if self.metadata and "devices" in self.metadata and current_device_name in self.metadata["devices"] \
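Collapsing the cpu/mps branches into `str(self.device)` means any device string, including one injected via ACCELERATOR, can match a `devices:` key in a model's metadata.yaml. A self-contained sketch of the lookup that follows, with a dict literal standing in for the parsed YAML shown earlier:

```python
metadata = {"devices": {"NVIDIA A100-SXM4-40GB": {"eval_batch_size": 1}}}

test = "eval"
current_device_name = "NVIDIA A100-SXM4-40GB"  # e.g. torch.cuda.get_device_name()

device_batch_size_key = f"{test}_batch_size"
if "devices" in metadata and current_device_name in metadata["devices"] \
        and device_batch_size_key in metadata["devices"][current_device_name]:
    batch_size = metadata["devices"][current_device_name][device_batch_size_key]

print(batch_size)  # -> 1
```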
