From 8dbc4ee093ae462fc151934bbb939f93b325dded Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+HMellor@users.noreply.github.com>
Date: Wed, 2 Aug 2023 09:29:38 -0700
Subject: [PATCH 1/2] Make `test.py` device agnostic (#1805)

Summary:
Allows the user to specify an arbitrary device and an arbitrary backend import using environment variables. The three added environment variables are:
- `TIMEOUT` - allows the timeout to be configured to suit the characteristics of the arbitrary device
- `ACCELERATOR` - a string which is added to the devices list, just like `'cpu'`, `'cuda'` & `'mps'` currently are
- `ACCELERATOR_BACKEND` - a string which is the name of the backend package (which doesn't yet exist in upstream `torch`) needed to use `ACCELERATOR`

These environment variables are used as follows:
```
TIMEOUT=100 ACCELERATOR=my_accelerator ACCELERATOR_BACKEND=my_accelerator_backend \
python test.py -k "test_hf_Bert_check_device_my_accelerator"
```

Pull Request resolved: https://github.com/pytorch/benchmark/pull/1805

Reviewed By: msaroufim

Differential Revision: D47993178

Pulled By: xuzhao9

fbshipit-source-id: 1fe3af21e6192b0036eb88167b77eb994facddab
---
 test.py                      | 8 +++++---
 torchbenchmark/__init__.py   | 2 ++
 torchbenchmark/util/model.py | 6 ++----
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/test.py b/test.py
index 2fe291bd46..baadb19e47 100644
--- a/test.py
+++ b/test.py
@@ -20,10 +20,10 @@
 # Some of the models have very heavyweight setup, so we have to set a very
 # generous limit. That said, we don't want the entire test suite to hang if
 # a single test encounters an extreme failure, so we give up after a test is
-# unresponsive to 5 minutes. (Note: this does not require that the entire
-# test case completes in 5 minutes. It requires that if the worker is
+# unresponsive to 5 minutes by default. (Note: this does not require that the
+# entire test case completes in 5 minutes. It requires that if the worker is
 # unresponsive for 5 minutes the parent will presume it dead / incapacitated.)
-TIMEOUT = 300 # Seconds
+TIMEOUT = int(os.getenv("TIMEOUT", 300)) # Seconds
 
 
 class TestBenchmark(unittest.TestCase):
@@ -123,6 +123,8 @@ def _load_tests():
         devices.append('cuda')
     if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
         devices.append('mps')
+    if device := os.getenv('ACCELERATOR'):
+        devices.append(device)
 
     for path in _list_model_paths():
         # TODO: skipping quantized tests for now due to BC-breaking changes for prepare
diff --git a/torchbenchmark/__init__.py b/torchbenchmark/__init__.py
index b98c875af9..00eeac0443 100644
--- a/torchbenchmark/__init__.py
+++ b/torchbenchmark/__init__.py
@@ -295,6 +295,8 @@ def _maybe_import_model(package: str, model_path: str) -> Dict[str, Any]:
     diagnostic_msg = ""
     try:
         module = importlib.import_module(f'.models.{model_name}', package=package)
+        if accelerator_backend := os.getenv("ACCELERATOR_BACKEND"):
+            setattr(module, accelerator_backend, importlib.import_module(accelerator_backend))
         Model = getattr(module, 'Model', None)
         if Model is None:
             diagnostic_msg = f"Warning: {module} does not define attribute Model, skip it"
diff --git a/torchbenchmark/util/model.py b/torchbenchmark/util/model.py
index f108f19f4a..d847408175 100644
--- a/torchbenchmark/util/model.py
+++ b/torchbenchmark/util/model.py
@@ -162,10 +162,8 @@ def determine_batch_size(self, batch_size=None):
             assert current_device_name, f"torch.cuda.get_device_name() returns None when device is set to cuda, please double check."
             if current_device_name in SPECIAL_DEVICE_MAPPING:
                 current_device_name = SPECIAL_DEVICE_MAPPING[current_device_name]
-        elif self.device == "cpu":
-            current_device_name = "cpu"
-        elif self.device == "mps":
-            current_device_name = "mps"
+        else:
+            current_device_name = str(self.device)
         # use the device suggestion on CUDA inference tests, key should be either eval_batch_size or train_batch_size
         device_batch_size_key = f"{self.test}_batch_size"
         if self.metadata and "devices" in self.metadata and current_device_name in self.metadata["devices"] \

From e7ca300f6af720ae998317a61be4ee8d2f3e770f Mon Sep 17 00:00:00 2001
From: Mark Saroufim
Date: Wed, 2 Aug 2023 15:18:30 -0700
Subject: [PATCH 2/2] all the llamas in canary (#1803)

Summary:
All of these OOM on a single device, but we want to make them all available for the distributed tests H-Huang is working on.

Pull Request resolved: https://github.com/pytorch/benchmark/pull/1803

Reviewed By: H-Huang, xuzhao9

Differential Revision: D47994029

Pulled By: msaroufim

fbshipit-source-id: ad335348b911ddb2379a87a4ec9ef3b0dcc91ea0
---
 .../canary_models/llama_v2_13b/__init__.py      | 15 +++++++++++++++
 .../canary_models/llama_v2_13b/install.py       |  9 +++++++++
 .../canary_models/llama_v2_13b/metadata.yaml    | 12 ++++++++++++
 .../canary_models/llama_v2_70b/__init__.py      | 16 ++++++++++++++++
 .../canary_models/llama_v2_70b/install.py       |  9 +++++++++
 .../canary_models/llama_v2_70b/metadata.yaml    | 12 ++++++++++++
 .../canary_models/llama_v2_7b/__init__.py       | 16 ++++++++++++++++
 .../canary_models/llama_v2_7b/install.py        |  9 +++++++++
 .../canary_models/llama_v2_7b/metadata.yaml     | 12 ++++++++++++
 .../util/framework/huggingface/model_factory.py |  3 +++
 10 files changed, 113 insertions(+)
 create mode 100644 torchbenchmark/canary_models/llama_v2_13b/__init__.py
 create mode 100644 torchbenchmark/canary_models/llama_v2_13b/install.py
 create mode 100644 torchbenchmark/canary_models/llama_v2_13b/metadata.yaml
 create mode 100644 torchbenchmark/canary_models/llama_v2_70b/__init__.py
 create mode 100644 torchbenchmark/canary_models/llama_v2_70b/install.py
 create mode 100644 torchbenchmark/canary_models/llama_v2_70b/metadata.yaml
 create mode 100644 torchbenchmark/canary_models/llama_v2_7b/__init__.py
 create mode 100644 torchbenchmark/canary_models/llama_v2_7b/install.py
 create mode 100644 torchbenchmark/canary_models/llama_v2_7b/metadata.yaml

diff --git a/torchbenchmark/canary_models/llama_v2_13b/__init__.py b/torchbenchmark/canary_models/llama_v2_13b/__init__.py
new file mode 100644
index 0000000000..bf307dfedf
--- /dev/null
+++ b/torchbenchmark/canary_models/llama_v2_13b/__init__.py
@@ -0,0 +1,15 @@
+from torchbenchmark.tasks import NLP
+from torchbenchmark.util.framework.huggingface.model_factory import HuggingFaceModel, HuggingFaceAuthMixin
+
+class Model(HuggingFaceModel, HuggingFaceAuthMixin):
+    task = NLP.LANGUAGE_MODELING
+    DEFAULT_TRAIN_BSIZE = 1
+    DEFAULT_EVAL_BSIZE = 1
+    DEEPCOPY = False
+
+    def __init__(self, test, device, batch_size=None, extra_args=[]):
+        HuggingFaceAuthMixin.__init__(self)
+        super().__init__(name="llama_v2_13b", test=test, device=device, batch_size=batch_size, extra_args=extra_args)
+
+    def train(self):
+        return NotImplementedError("FSDP should implement a training loop")
\ No newline at end of file
diff --git a/torchbenchmark/canary_models/llama_v2_13b/install.py b/torchbenchmark/canary_models/llama_v2_13b/install.py
new file mode 100644
index 0000000000..cc27b6e7cf
--- /dev/null
+++ b/torchbenchmark/canary_models/llama_v2_13b/install.py
@@ -0,0 +1,9 @@
+
+import subprocess
+import sys
+import os
+from torchbenchmark.util.framework.huggingface.patch_hf import patch_transformers, cache_model
+
+if __name__ == '__main__':
+    model_name = os.path.basename(os.path.dirname(os.path.abspath(__file__)))
+    cache_model(model_name)
\ No newline at end of file
diff --git a/torchbenchmark/canary_models/llama_v2_13b/metadata.yaml b/torchbenchmark/canary_models/llama_v2_13b/metadata.yaml
new file mode 100644
index 0000000000..db5866b059
--- /dev/null
+++ b/torchbenchmark/canary_models/llama_v2_13b/metadata.yaml
@@ -0,0 +1,12 @@
+devices:
+  NVIDIA A100-SXM4-40GB:
+    eval_batch_size: 1
+eval_benchmark: false
+eval_deterministic: false
+eval_nograd: true
+not_implemented:
+- device: cpu
+- device: cuda
+  test: train
+train_benchmark: false
+train_deterministic: false
\ No newline at end of file
diff --git a/torchbenchmark/canary_models/llama_v2_70b/__init__.py b/torchbenchmark/canary_models/llama_v2_70b/__init__.py
new file mode 100644
index 0000000000..6878b26c9d
--- /dev/null
+++ b/torchbenchmark/canary_models/llama_v2_70b/__init__.py
@@ -0,0 +1,16 @@
+from torchbenchmark.tasks import NLP
+from torchbenchmark.util.framework.huggingface.model_factory import HuggingFaceModel, HuggingFaceAuthMixin
+
+class Model(HuggingFaceModel, HuggingFaceAuthMixin):
+    task = NLP.LANGUAGE_MODELING
+    DEFAULT_TRAIN_BSIZE = 1
+    DEFAULT_EVAL_BSIZE = 1
+    DEEPCOPY = False
+
+    def __init__(self, test, device, batch_size=None, extra_args=[]):
+        HuggingFaceAuthMixin.__init__(self)
+        super().__init__(name="llama_v2_70b", test=test, device=device, batch_size=batch_size, extra_args=extra_args)
+
+
+    def train(self):
+        return NotImplementedError("FSDP should implement a training loop")
diff --git a/torchbenchmark/canary_models/llama_v2_70b/install.py b/torchbenchmark/canary_models/llama_v2_70b/install.py
new file mode 100644
index 0000000000..cc27b6e7cf
--- /dev/null
+++ b/torchbenchmark/canary_models/llama_v2_70b/install.py
@@ -0,0 +1,9 @@
+
+import subprocess
+import sys
+import os
+from torchbenchmark.util.framework.huggingface.patch_hf import patch_transformers, cache_model
+
+if __name__ == '__main__':
+    model_name = os.path.basename(os.path.dirname(os.path.abspath(__file__)))
+    cache_model(model_name)
\ No newline at end of file
diff --git a/torchbenchmark/canary_models/llama_v2_70b/metadata.yaml b/torchbenchmark/canary_models/llama_v2_70b/metadata.yaml
new file mode 100644
index 0000000000..db5866b059
--- /dev/null
+++ b/torchbenchmark/canary_models/llama_v2_70b/metadata.yaml
@@ -0,0 +1,12 @@
+devices:
+  NVIDIA A100-SXM4-40GB:
+    eval_batch_size: 1
+eval_benchmark: false
+eval_deterministic: false
+eval_nograd: true
+not_implemented:
+- device: cpu
+- device: cuda
+  test: train
+train_benchmark: false
+train_deterministic: false
\ No newline at end of file
diff --git a/torchbenchmark/canary_models/llama_v2_7b/__init__.py b/torchbenchmark/canary_models/llama_v2_7b/__init__.py
new file mode 100644
index 0000000000..326a47ef4e
--- /dev/null
+++ b/torchbenchmark/canary_models/llama_v2_7b/__init__.py
@@ -0,0 +1,16 @@
+from torchbenchmark.tasks import NLP
+from torchbenchmark.util.framework.huggingface.model_factory import HuggingFaceModel, HuggingFaceAuthMixin
+
+class Model(HuggingFaceModel, HuggingFaceAuthMixin):
+    task = NLP.LANGUAGE_MODELING
+    DEFAULT_TRAIN_BSIZE = 1
+    DEFAULT_EVAL_BSIZE = 1
+    DEEPCOPY = False
+
+    def __init__(self, test, device, batch_size=None, extra_args=[]):
+        HuggingFaceAuthMixin.__init__(self)
+        super().__init__(name="llama_v2_7b", test=test, device=device, batch_size=batch_size, extra_args=extra_args)
+
+
+    def train(self):
+        return NotImplementedError("FSDP should implement a training loop")
diff --git a/torchbenchmark/canary_models/llama_v2_7b/install.py b/torchbenchmark/canary_models/llama_v2_7b/install.py
new file mode 100644
index 0000000000..cc27b6e7cf
--- /dev/null
+++ b/torchbenchmark/canary_models/llama_v2_7b/install.py
@@ -0,0 +1,9 @@
+
+import subprocess
+import sys
+import os
+from torchbenchmark.util.framework.huggingface.patch_hf import patch_transformers, cache_model
+
+if __name__ == '__main__':
+    model_name = os.path.basename(os.path.dirname(os.path.abspath(__file__)))
+    cache_model(model_name)
\ No newline at end of file
diff --git a/torchbenchmark/canary_models/llama_v2_7b/metadata.yaml b/torchbenchmark/canary_models/llama_v2_7b/metadata.yaml
new file mode 100644
index 0000000000..db5866b059
--- /dev/null
+++ b/torchbenchmark/canary_models/llama_v2_7b/metadata.yaml
@@ -0,0 +1,12 @@
+devices:
+  NVIDIA A100-SXM4-40GB:
+    eval_batch_size: 1
+eval_benchmark: false
+eval_deterministic: false
+eval_nograd: true
+not_implemented:
+- device: cpu
+- device: cuda
+  test: train
+train_benchmark: false
+train_deterministic: false
\ No newline at end of file
diff --git a/torchbenchmark/util/framework/huggingface/model_factory.py b/torchbenchmark/util/framework/huggingface/model_factory.py
index 3327ad72db..d317483a67 100644
--- a/torchbenchmark/util/framework/huggingface/model_factory.py
+++ b/torchbenchmark/util/framework/huggingface/model_factory.py
@@ -30,6 +30,9 @@
     'hf_Whisper': (1024, 1024, 'WhisperConfig()', 'AutoModelForAudioClassification'),
     # default num_hidden_layers=32 but that OOMs, feel free to change this config to something more real
     'llama_v2_7b_16h' : (512,512, 'LlamaConfig(num_hidden_layers=16)', 'AutoModelForCausalLM'),
+    'llama_v2_7b' : (512,512, 'AutoConfig.from_pretrained("meta-llama/Llama-2-7b-hf")', 'AutoModelForCausalLM'),
+    'llama_v2_13b' : (512,512, 'AutoConfig.from_pretrained("meta-llama/Llama-2-13b-hf")', 'AutoModelForCausalLM'),
+    'llama_v2_70b' : (512, 512, 'AutoConfig.from_pretrained("meta-llama/Llama-2-70b-hf")', 'AutoModelForMaskedLM'),
 }
 
 cpu_input_slice = {