From 8dbc4ee093ae462fc151934bbb939f93b325dded Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+HMellor@users.noreply.github.com>
Date: Wed, 2 Aug 2023 09:29:38 -0700
Subject: [PATCH 1/2] Make `test.py` device agnostic (#1805)

Summary:
Allows the user to specify an arbitrary device and an arbitrary backend import using environment variables. The three added environment variables are:
- `TIMEOUT` - allows the timeout to be configured to suit the characteristics of the arbitrary device
- `ACCELERATOR` - a string which is added to the devices list, just like `'cpu'`, `'cuda'` & `'mps'` currently are
- `ACCELERATOR_BACKEND` - a string which is the name of the backend package (which doesn't yet exist in upstream `torch`) needed to use `ACCELERATOR`

These environment variables are used as follows:
```
TIMEOUT=100 ACCELERATOR=my_accelerator ACCELERATOR_BACKEND=my_accelerator_backend \
python test.py -k "test_hf_Bert_check_device_my_accelerator"
```

Pull Request resolved: https://github.com/pytorch/benchmark/pull/1805

Reviewed By: msaroufim

Differential Revision: D47993178

Pulled By: xuzhao9

fbshipit-source-id: 1fe3af21e6192b0036eb88167b77eb994facddab
---
 test.py                      | 8 +++++---
 torchbenchmark/__init__.py   | 2 ++
 torchbenchmark/util/model.py | 6 ++----
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/test.py b/test.py
index 2fe291bd46..baadb19e47 100644
--- a/test.py
+++ b/test.py
@@ -20,10 +20,10 @@
 # Some of the models have very heavyweight setup, so we have to set a very
 # generous limit. That said, we don't want the entire test suite to hang if
 # a single test encounters an extreme failure, so we give up after a test is
-# unresponsive to 5 minutes. (Note: this does not require that the entire
-# test case completes in 5 minutes. It requires that if the worker is
+# unresponsive to 5 minutes by default. (Note: this does not require that the
+# entire test case completes in 5 minutes. It requires that if the worker is
 # unresponsive for 5 minutes the parent will presume it dead / incapacitated.)
-TIMEOUT = 300 # Seconds
+TIMEOUT = int(os.getenv("TIMEOUT", 300)) # Seconds
 
 
 class TestBenchmark(unittest.TestCase):
@@ -123,6 +123,8 @@ def _load_tests():
         devices.append('cuda')
     if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
         devices.append('mps')
+    if device := os.getenv('ACCELERATOR'):
+        devices.append(device)
 
     for path in _list_model_paths():
         # TODO: skipping quantized tests for now due to BC-breaking changes for prepare
diff --git a/torchbenchmark/__init__.py b/torchbenchmark/__init__.py
index b98c875af9..00eeac0443 100644
--- a/torchbenchmark/__init__.py
+++ b/torchbenchmark/__init__.py
@@ -295,6 +295,8 @@ def _maybe_import_model(package: str, model_path: str) -> Dict[str, Any]:
     diagnostic_msg = ""
     try:
         module = importlib.import_module(f'.models.{model_name}', package=package)
+        if accelerator_backend := os.getenv("ACCELERATOR_BACKEND"):
+            setattr(module, accelerator_backend, importlib.import_module(accelerator_backend))
         Model = getattr(module, 'Model', None)
         if Model is None:
             diagnostic_msg = f"Warning: {module} does not define attribute Model, skip it"
diff --git a/torchbenchmark/util/model.py b/torchbenchmark/util/model.py
index f108f19f4a..d847408175 100644
--- a/torchbenchmark/util/model.py
+++ b/torchbenchmark/util/model.py
@@ -162,10 +162,8 @@ def determine_batch_size(self, batch_size=None):
             assert current_device_name, f"torch.cuda.get_device_name() returns None when device is set to cuda, please double check."
             if current_device_name in SPECIAL_DEVICE_MAPPING:
                 current_device_name = SPECIAL_DEVICE_MAPPING[current_device_name]
-        elif self.device == "cpu":
-            current_device_name = "cpu"
-        elif self.device == "mps":
-            current_device_name = "mps"
+        else:
+            current_device_name = str(self.device)
         # use the device suggestion on CUDA inference tests, key should be either eval_batch_size or train_batch_size
         device_batch_size_key = f"{self.test}_batch_size"
         if self.metadata and "devices" in self.metadata and current_device_name in self.metadata["devices"] \

From e7ca300f6af720ae998317a61be4ee8d2f3e770f Mon Sep 17 00:00:00 2001
From: Mark Saroufim
Date: Wed, 2 Aug 2023 15:18:30 -0700
Subject: [PATCH 2/2] all the llamas in canary (#1803)

Summary:
All of these OOM on a single device, but we want to make them all available for the distributed tests H-Huang is working on.

Pull Request resolved: https://github.com/pytorch/benchmark/pull/1803

Reviewed By: H-Huang, xuzhao9

Differential Revision: D47994029

Pulled By: msaroufim

fbshipit-source-id: ad335348b911ddb2379a87a4ec9ef3b0dcc91ea0
---
 .../canary_models/llama_v2_13b/__init__.py      | 15 +++++++++++++++
 .../canary_models/llama_v2_13b/install.py       |  9 +++++++++
 .../canary_models/llama_v2_13b/metadata.yaml    | 12 ++++++++++++
 .../canary_models/llama_v2_70b/__init__.py      | 16 ++++++++++++++++
 .../canary_models/llama_v2_70b/install.py       |  9 +++++++++
 .../canary_models/llama_v2_70b/metadata.yaml    | 12 ++++++++++++
 .../canary_models/llama_v2_7b/__init__.py       | 16 ++++++++++++++++
 .../canary_models/llama_v2_7b/install.py        |  9 +++++++++
 .../canary_models/llama_v2_7b/metadata.yaml     | 12 ++++++++++++
 .../util/framework/huggingface/model_factory.py |  3 +++
 10 files changed, 113 insertions(+)
 create mode 100644 torchbenchmark/canary_models/llama_v2_13b/__init__.py
 create mode 100644 torchbenchmark/canary_models/llama_v2_13b/install.py
 create mode 100644 torchbenchmark/canary_models/llama_v2_13b/metadata.yaml
 create mode 100644 torchbenchmark/canary_models/llama_v2_70b/__init__.py
 create mode 100644 torchbenchmark/canary_models/llama_v2_70b/install.py
 create mode 100644 torchbenchmark/canary_models/llama_v2_70b/metadata.yaml
 create mode 100644 torchbenchmark/canary_models/llama_v2_7b/__init__.py
 create mode 100644 torchbenchmark/canary_models/llama_v2_7b/install.py
 create mode 100644 torchbenchmark/canary_models/llama_v2_7b/metadata.yaml

diff --git a/torchbenchmark/canary_models/llama_v2_13b/__init__.py b/torchbenchmark/canary_models/llama_v2_13b/__init__.py
new file mode 100644
index 0000000000..bf307dfedf
--- /dev/null
+++ b/torchbenchmark/canary_models/llama_v2_13b/__init__.py
@@ -0,0 +1,15 @@
+from torchbenchmark.tasks import NLP
+from torchbenchmark.util.framework.huggingface.model_factory import HuggingFaceModel, HuggingFaceAuthMixin
+
+class Model(HuggingFaceModel, HuggingFaceAuthMixin):
+    task = NLP.LANGUAGE_MODELING
+    DEFAULT_TRAIN_BSIZE = 1
+    DEFAULT_EVAL_BSIZE = 1
+    DEEPCOPY = False
+
+    def __init__(self, test, device, batch_size=None, extra_args=[]):
+        HuggingFaceAuthMixin.__init__(self)
+        super().__init__(name="llama_v2_13b", test=test, device=device, batch_size=batch_size, extra_args=extra_args)
+
+    def train(self):
+        return NotImplementedError("FSDP should implement a training loop")
\ No newline at end of file
diff --git a/torchbenchmark/canary_models/llama_v2_13b/install.py b/torchbenchmark/canary_models/llama_v2_13b/install.py
new file mode 100644
index 0000000000..cc27b6e7cf
--- /dev/null
+++ b/torchbenchmark/canary_models/llama_v2_13b/install.py
@@ -0,0 +1,9 @@
+
+import subprocess
+import sys
+import os
+from torchbenchmark.util.framework.huggingface.patch_hf import patch_transformers, cache_model
+
+if __name__ == '__main__':
+    model_name = os.path.basename(os.path.dirname(os.path.abspath(__file__)))
+    cache_model(model_name)
\ No newline at end of file
diff --git a/torchbenchmark/canary_models/llama_v2_13b/metadata.yaml b/torchbenchmark/canary_models/llama_v2_13b/metadata.yaml
new file mode 100644
index 0000000000..db5866b059
--- /dev/null
+++ b/torchbenchmark/canary_models/llama_v2_13b/metadata.yaml
@@ -0,0 +1,12 @@
+devices:
+  NVIDIA A100-SXM4-40GB:
+    eval_batch_size: 1
+eval_benchmark: false
+eval_deterministic: false
+eval_nograd: true
+not_implemented:
+- device: cpu
+- device: cuda
+  test: train
+train_benchmark: false
+train_deterministic: false
\ No newline at end of file
diff --git a/torchbenchmark/canary_models/llama_v2_70b/__init__.py b/torchbenchmark/canary_models/llama_v2_70b/__init__.py
new file mode 100644
index 0000000000..6878b26c9d
--- /dev/null
+++ b/torchbenchmark/canary_models/llama_v2_70b/__init__.py
@@ -0,0 +1,16 @@
+from torchbenchmark.tasks import NLP
+from torchbenchmark.util.framework.huggingface.model_factory import HuggingFaceModel, HuggingFaceAuthMixin
+
+class Model(HuggingFaceModel, HuggingFaceAuthMixin):
+    task = NLP.LANGUAGE_MODELING
+    DEFAULT_TRAIN_BSIZE = 1
+    DEFAULT_EVAL_BSIZE = 1
+    DEEPCOPY = False
+
+    def __init__(self, test, device, batch_size=None, extra_args=[]):
+        HuggingFaceAuthMixin.__init__(self)
+        super().__init__(name="llama_v2_70b", test=test, device=device, batch_size=batch_size, extra_args=extra_args)
+
+
+    def train(self):
+        return NotImplementedError("FSDP should implement a training loop")
diff --git a/torchbenchmark/canary_models/llama_v2_70b/install.py b/torchbenchmark/canary_models/llama_v2_70b/install.py
new file mode 100644
index 0000000000..cc27b6e7cf
--- /dev/null
+++ b/torchbenchmark/canary_models/llama_v2_70b/install.py
@@ -0,0 +1,9 @@
+
+import subprocess
+import sys
+import os
+from torchbenchmark.util.framework.huggingface.patch_hf import patch_transformers, cache_model
+
+if __name__ == '__main__':
+    model_name = os.path.basename(os.path.dirname(os.path.abspath(__file__)))
+    cache_model(model_name)
\ No newline at end of file
diff --git a/torchbenchmark/canary_models/llama_v2_70b/metadata.yaml b/torchbenchmark/canary_models/llama_v2_70b/metadata.yaml
new file mode 100644
index 0000000000..db5866b059
--- /dev/null
+++ b/torchbenchmark/canary_models/llama_v2_70b/metadata.yaml
@@ -0,0 +1,12 @@
+devices:
+  NVIDIA A100-SXM4-40GB:
+    eval_batch_size: 1
+eval_benchmark: false
+eval_deterministic: false
+eval_nograd: true
+not_implemented:
+- device: cpu
+- device: cuda
+  test: train
+train_benchmark: false
+train_deterministic: false
\ No newline at end of file
diff --git a/torchbenchmark/canary_models/llama_v2_7b/__init__.py b/torchbenchmark/canary_models/llama_v2_7b/__init__.py
new file mode 100644
index 0000000000..326a47ef4e
--- /dev/null
+++ b/torchbenchmark/canary_models/llama_v2_7b/__init__.py
@@ -0,0 +1,16 @@
+from torchbenchmark.tasks import NLP
+from torchbenchmark.util.framework.huggingface.model_factory import HuggingFaceModel, HuggingFaceAuthMixin
+
+class Model(HuggingFaceModel, HuggingFaceAuthMixin):
+    task = NLP.LANGUAGE_MODELING
+    DEFAULT_TRAIN_BSIZE = 1
+    DEFAULT_EVAL_BSIZE = 1
+    DEEPCOPY = False
+
+    def __init__(self, test, device, batch_size=None, extra_args=[]):
+        HuggingFaceAuthMixin.__init__(self)
+        super().__init__(name="llama_v2_7b", test=test, device=device, batch_size=batch_size, extra_args=extra_args)
+
+
+    def train(self):
+        return NotImplementedError("FSDP should implement a training loop")
diff --git a/torchbenchmark/canary_models/llama_v2_7b/install.py b/torchbenchmark/canary_models/llama_v2_7b/install.py
new file mode 100644
index 0000000000..cc27b6e7cf
--- /dev/null
+++ b/torchbenchmark/canary_models/llama_v2_7b/install.py
@@ -0,0 +1,9 @@
+
+import subprocess
+import sys
+import os
+from torchbenchmark.util.framework.huggingface.patch_hf import patch_transformers, cache_model
+
+if __name__ == '__main__':
+    model_name = os.path.basename(os.path.dirname(os.path.abspath(__file__)))
+    cache_model(model_name)
\ No newline at end of file
diff --git a/torchbenchmark/canary_models/llama_v2_7b/metadata.yaml b/torchbenchmark/canary_models/llama_v2_7b/metadata.yaml
new file mode 100644
index 0000000000..db5866b059
--- /dev/null
+++ b/torchbenchmark/canary_models/llama_v2_7b/metadata.yaml
@@ -0,0 +1,12 @@
+devices:
+  NVIDIA A100-SXM4-40GB:
+    eval_batch_size: 1
+eval_benchmark: false
+eval_deterministic: false
+eval_nograd: true
+not_implemented:
+- device: cpu
+- device: cuda
+  test: train
+train_benchmark: false
+train_deterministic: false
\ No newline at end of file
diff --git a/torchbenchmark/util/framework/huggingface/model_factory.py b/torchbenchmark/util/framework/huggingface/model_factory.py
index 3327ad72db..d317483a67 100644
--- a/torchbenchmark/util/framework/huggingface/model_factory.py
+++ b/torchbenchmark/util/framework/huggingface/model_factory.py
@@ -30,6 +30,9 @@
     'hf_Whisper': (1024, 1024, 'WhisperConfig()', 'AutoModelForAudioClassification'),
     # default num_hidden_layers=32 but that OOMs, feel free to change this config to something more real
     'llama_v2_7b_16h' : (512,512, 'LlamaConfig(num_hidden_layers=16)', 'AutoModelForCausalLM'),
+    'llama_v2_7b' : (512,512, 'AutoConfig.from_pretrained("meta-llama/Llama-2-7b-hf")', 'AutoModelForCausalLM'),
+    'llama_v2_13b' : (512,512, 'AutoConfig.from_pretrained("meta-llama/Llama-2-13b-hf")', 'AutoModelForCausalLM'),
+    'llama_v2_70b' : (512, 512, 'AutoConfig.from_pretrained("meta-llama/Llama-2-70b-hf")', 'AutoModelForMaskedLM'),
 }
 
 cpu_input_slice = {