add/fix patches for PyTorch 1.13.1 w/ foss/2022a #18371

Merged
easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1-foss-2022a.eb (26 changes: 13 additions & 13 deletions)

@@ -12,31 +12,30 @@ sources = ['%(namelower)s-v%(version)s.tar.gz']
patches = [
'PyTorch-1.7.0_disable-dev-shm-test.patch',
'PyTorch-1.10.0_fix-kineto-crash.patch',
- 'PyTorch-1.11.0_fix-fsdp-fp16-test.patch',
'PyTorch-1.11.1_skip-test_init_from_local_shards.patch',
'PyTorch-1.12.1_add-hypothesis-suppression.patch',
'PyTorch-1.12.1_fix-skip-decorators.patch',
'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch',
'PyTorch-1.12.1_fix-test_wishart_log_prob.patch',
'PyTorch-1.12.1_fix-TestTorch.test_to.patch',
'PyTorch-1.12.1_fix-use-after-free-in-tensorpipe-agent.patch',
- 'PyTorch-1.12.1_fix-vsx-vector-funcs.patch',
'PyTorch-1.12.1_fix-vsx-loadu.patch',
+ 'PyTorch-1.12.1_fix-vsx-vector-funcs.patch',
'PyTorch-1.12.1_skip-test_round_robin.patch',
+ 'PyTorch-1.13.1_fix-fsdp-fp16-test.patch',
'PyTorch-1.13.1_fix-pytest-args.patch',
'PyTorch-1.13.1_fix-test-ops-conf.patch',
- 'PyTorch-1.13.1_no-cuda-stubs-rpath.patch',
- 'PyTorch-1.13.1_remove-flaky-test-in-testnn.patch',
- 'PyTorch-1.13.1_skip-ao-sparsity-test-without-fbgemm.patch',
'PyTorch-1.13.1_increase-tolerance-test_ops.patch',
'PyTorch-1.13.1_install-vsx-vec-headers.patch',
+ 'PyTorch-1.13.1_no-cuda-stubs-rpath.patch',
+ 'PyTorch-1.13.1_remove-flaky-test-in-testnn.patch',
'PyTorch-1.13.1_skip-failing-grad-test.patch',
+ 'PyTorch-1.13.1_skip-tests-without-fbgemm.patch',
]
checksums = [
{'pytorch-v1.13.1.tar.gz': 'dbc229ee9750b02b514937d017744443a269ea0241ed3f32b9af0703589d25d4'},
{'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'},
{'PyTorch-1.10.0_fix-kineto-crash.patch': 'dc467333b28162149af8f675929d8c6bf219f23230bfc0d39af02ba4f6f882eb'},
- {'PyTorch-1.11.0_fix-fsdp-fp16-test.patch': 'bb1c4e6d6fd4b0cf57ff8b824c797331b533bb1ffc63f5db0bae3aee10c3dc13'},
{'PyTorch-1.11.1_skip-test_init_from_local_shards.patch':
'4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7'},
{'PyTorch-1.12.1_add-hypothesis-suppression.patch':
@@ -49,21 +48,22 @@ checksums = [
{'PyTorch-1.12.1_fix-TestTorch.test_to.patch': '75f27987c3f25c501e719bd2b1c70a029ae0ee28514a97fe447516aee02b1535'},
{'PyTorch-1.12.1_fix-use-after-free-in-tensorpipe-agent.patch':
'0bd7e88b92c4c6f0fecf01746009858ba19f2df68b10b88c41485328a531875d'},
- {'PyTorch-1.12.1_fix-vsx-vector-funcs.patch': 'caccbf60f62eac313896c1eaec78b08f5d0fdfcb907079087490bb13d1561aa2'},
{'PyTorch-1.12.1_fix-vsx-loadu.patch': '8bfe3c94ada1dd1f7974a1261a8b576fb7ae944050fa1c7830fca033831123b2'},
+ {'PyTorch-1.12.1_fix-vsx-vector-funcs.patch': 'caccbf60f62eac313896c1eaec78b08f5d0fdfcb907079087490bb13d1561aa2'},
{'PyTorch-1.12.1_skip-test_round_robin.patch': '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349'},
+ {'PyTorch-1.13.1_fix-fsdp-fp16-test.patch': '8ae68e60d6e1f92f50322b7f0381c7e65251fba32d7606e3a238a36a2f55b5cf'},
{'PyTorch-1.13.1_fix-pytest-args.patch': 'd3e3c841cf8d73683750f29326f2be56ee0bb5df7ff522baf7d7c3f301a91ec2'},
{'PyTorch-1.13.1_fix-test-ops-conf.patch': 'df652eec7753864ebebbfeca546929a53e3fb8f24259d5c9b964266a8551198c'},
- {'PyTorch-1.13.1_no-cuda-stubs-rpath.patch': '4c636059850fc9d1ecb27ce275f8aad5d5b6fdc19e35aff0c25b86cb3201352a'},
- {'PyTorch-1.13.1_remove-flaky-test-in-testnn.patch':
- 'be83ff61fe2dedab6d49c232936d5622df81ab49154264490021c6c828e53315'},
- {'PyTorch-1.13.1_skip-ao-sparsity-test-without-fbgemm.patch':
- '92cd48ef6d01aa7e07ccce1dcaf40bc3fb0f220c4aa4fea15f3e05fb42e37909'},
{'PyTorch-1.13.1_increase-tolerance-test_ops.patch':
- 'd53e98bf0da7788b68042dcc31bc5708dae962fde3f110cc827eb807a5d08e49'},
+ 'c909fdfc2b12df457e1eb5514265ffec3eab653994949416f3f048668421e223'},
{'PyTorch-1.13.1_install-vsx-vec-headers.patch':
'7b678f54bb947afd4767f5877ac424b4b94ce5db609ea20f5a869ccf4027035f'},
+ {'PyTorch-1.13.1_no-cuda-stubs-rpath.patch': '4c636059850fc9d1ecb27ce275f8aad5d5b6fdc19e35aff0c25b86cb3201352a'},
+ {'PyTorch-1.13.1_remove-flaky-test-in-testnn.patch':
+ 'be83ff61fe2dedab6d49c232936d5622df81ab49154264490021c6c828e53315'},
{'PyTorch-1.13.1_skip-failing-grad-test.patch': '6681200f9509893cb9231b5c93ac9bc5e6d9d9ae4febefca52e7cbc843ba8f51'},
+ {'PyTorch-1.13.1_skip-tests-without-fbgemm.patch':
+ '481e595f673baf8ae58b41697a6792b83048b0264aa79b422f48cd8c22948bb7'},
]

osdependencies = [OS_PKG_IBVERBS_DEV]
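For anyone refreshing these hashes by hand: EasyBuild can fill them in automatically (eb --inject-checksums), but a minimal Python sketch, assuming the patch file sits in the current directory, looks like this:

# Minimal sketch: compute the SHA-256 digest that goes into 'checksums'.
# Assumes the patch file is in the current working directory; EasyBuild
# can also inject these for you via 'eb --inject-checksums'.
import hashlib

def sha256_of(path):
    with open(path, 'rb') as f:
        return hashlib.sha256(f.read()).hexdigest()

print(sha256_of('PyTorch-1.13.1_skip-tests-without-fbgemm.patch'))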
easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1_fix-fsdp-fp16-test.patch (new file)

@@ -0,0 +1,21 @@
The test fails on a node with more than 5 V100 GPUs or more than 4 A100 GPUs.
Hence limit the world_size to 4
See https://github.com/pytorch/pytorch/pull/86280

Author: Alexander Grund (TU Dresden)

diff --git a/test/distributed/fsdp/test_fsdp_pure_fp16.py b/test/distributed/fsdp/test_fsdp_pure_fp16.py
index 1c663f8263354..e0033ef3d4b72 100644
--- a/test/distributed/fsdp/test_fsdp_pure_fp16.py
+++ b/test/distributed/fsdp/test_fsdp_pure_fp16.py
@@ -34,8 +34,8 @@
class TestPureFP16(FSDPTest):
@property
def world_size(self):
- # Test fails due to inaccuracies when using more than 5 GPUs
- return min(5, super().world_size)
+ # Test fails due to inaccuracies when using more than 4 GPUs
+ return min(4, super().world_size)

@skip_if_lt_x_gpu(2)
@parametrize(
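For context, FSDPTest sizes its process group from the GPUs it can see, so capping the world_size property caps the number of ranks spawned. A standalone illustration of the same pattern (capped_world_size is a hypothetical helper, not part of the patch):

# Illustration only: the patched property amounts to capping the rank count
# at 4. Assumes torch is importable; returns 0 when no GPU is visible.
import torch

def capped_world_size(cap=4):
    return min(cap, torch.cuda.device_count())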
easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1_increase-tolerance-test_ops.patch

@@ -4,18 +4,24 @@
> Greatest absolute difference: 1.430511474609375e-05 at index (4, 5) (up to 1e-05 allowed)
> Greatest relative difference: 4.65393206065873e-06 at index (4, 5) (up to 1.3e-06 allowed)

See https://github.com/pytorch/pytorch/pull/86365

Author: Alexander Grund (TU Dresden)
Updated for PyTorch 1.13.1: Simon Branford (University of Birmingham)

--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -545,6 +545,9 @@
else list(supported_dtypes)[0]
)

+ if dtype is torch.float32:
+ self.precision, self.rel_tol = (1.5e-05, 1e-05)
+
samples = op.sample_inputs(device, dtype)
for sample in samples:
# calls it normally to get the expected result
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 4b2d0ebabc46b..bab7843a72b74 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -8503,7 +8503,11 @@ op_db: List[OpInfo] = [
DecorateInfo(
toleranceOverride({torch.float32: tol(atol=1.3e-05, rtol=1.3e-05),
torch.complex64: tol(atol=1e-05, rtol=1.2e-03)}),
- 'TestCommon', 'test_numpy_refs')],
+ 'TestCommon', 'test_numpy_refs'),
+ DecorateInfo(
+ toleranceOverride({torch.float32: tol(atol=1.5e-05, rtol=1e-05)}),
+ 'TestCommon', 'test_out'),
+ ],
skips=(
# NVIDIA only assures that bfloat16 is supported by bmm if SM >= 5.3
DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes', device_type='cuda', active_if=not SM53OrLater),
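To see why the relaxed tolerances are enough: PyTorch's closeness checks accept a result when abs(actual - expected) <= atol + rtol * abs(expected). A self-contained check with a value chosen to mimic the reported difference:

# The mismatch quoted above is ~1.43e-05 in absolute terms on a float32
# result. The default float32 tolerances (atol=1e-05, rtol=1.3e-06, as in
# the quoted failure) reject it; the pair from the patch accepts it.
import torch

actual = torch.tensor([1.0000143], dtype=torch.float32)
expected = torch.tensor([1.0], dtype=torch.float32)
torch.testing.assert_close(actual, expected, atol=1.5e-05, rtol=1e-05)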

This file was deleted: easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1_skip-ao-sparsity-test-without-fbgemm.patch (replaced by PyTorch-1.13.1_skip-tests-without-fbgemm.patch, shown below).

easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1_skip-tests-without-fbgemm.patch (new file)

@@ -0,0 +1,68 @@
Those tests (from test_ao_sparsity & test_quantization) require FBGEMM which may not be available.
So add the skip decorator.
See https://github.com/pytorch/pytorch/issues/87364

Author: Alexander Grund (TU Dresden)
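
The diff below imports skipIfNoFBGEMM from torch.testing._internal.common_quantization. Roughly, such a guard boils down to the following sketch (illustrative only; skip_if_no_fbgemm is a hypothetical stand-in, and the real decorator also handles test classes and carries a fuller skip message):

# Sketch of a skip-without-FBGEMM guard: skip when the quantized backend
# does not list the 'fbgemm' engine on this machine.
import unittest
import torch

def skip_if_no_fbgemm(test_item):
    if 'fbgemm' not in torch.backends.quantized.supported_engines:
        return unittest.skip("Quantized operations require FBGEMM")(test_item)
    return test_item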

diff --git a/test/ao/sparsity/test_composability.py b/test/ao/sparsity/test_composability.py
index 6a1b6067a4c..0c43f585af2 100644
--- a/test/ao/sparsity/test_composability.py
+++ b/test/ao/sparsity/test_composability.py
@@ -9,6 +9,7 @@ import torch.ao.quantization as tq
from torch import nn
from torch.ao import sparsity
from torch.testing._internal.common_utils import TestCase
+from torch.testing._internal.common_quantization import skipIfNoFBGEMM
from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx, convert_to_reference_fx, prepare_qat_fx
from torch.ao.sparsity import fqn_to_module

@@ -62,6 +63,7 @@ def _calculate_sparsity(tensor):
# This series of tests are to check the composability goals for sparsity and quantization. Namely
# that performing quantization and sparsity model manipulations in various orderings
# does not cause problems
+@skipIfNoFBGEMM
class TestComposability(TestCase):
# This test checks whether performing quantization prepare before sparse prepare
# causes any issues and verifies that the correct observers are inserted and that
@@ -326,6 +328,7 @@ class TestFxComposability(TestCase):
r"""This series of tests checks that various steps of the quantization and sparsity flow
compose cleanly despite variation in sequencing.
"""
+ @skipIfNoFBGEMM
def test_q_prep_fx_before_s_prep(self):
r"""
This test checks that the ordering of prepare_fx -> sparse prepare -> convert_fx
@@ -445,6 +448,7 @@ class TestFxComposability(TestCase):
)
self.assertGreaterAlmostEqual(cur_sparsity, sparse_config[0]["sparsity_level"])

+ @skipIfNoFBGEMM
def test_s_prep_before_q_prep_fx(self):
r"""
This test checks that the ordering of sparse prepare -> prepare_fx -> convert_fx
@@ -490,6 +494,7 @@ class TestFxComposability(TestCase):
)
self.assertGreaterAlmostEqual(cur_sparsity, sparse_config[0]["sparsity_level"])

+ @skipIfNoFBGEMM
def test_s_prep_before_qat_prep_fx(self):
r"""
This test checks that the ordering of sparse prepare -> prepare_qat_fx -> convert_fx
diff --git a/test/quantization/core/test_docs.py b/test/quantization/core/test_docs.py
index 27842b46ce7..8e50ffa3166 100644
--- a/test/quantization/core/test_docs.py
+++ b/test/quantization/core/test_docs.py
@@ -10,11 +10,13 @@ import torch
from torch.testing._internal.common_quantization import (
QuantizationTestCase,
SingleLayerLinearModel,
+ skipIfNoFBGEMM,
)
from torch.testing._internal.common_quantized import override_quantized_engine
from torch.testing._internal.common_utils import IS_ARM64


+@skipIfNoFBGEMM
class TestQuantizationDocs(QuantizationTestCase):
r"""
The tests in this section import code from the quantization docs and check that