Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

{ai}[foss/2022a] PyTorch v1.13.1 w/ Python 3.10.4 #17155

Merged
116 changes: 116 additions & 0 deletions easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1-foss-2022a.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
name = 'PyTorch'
version = '1.13.1'

homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""

toolchain = {'name': 'foss', 'version': '2022a'}

source_urls = [GITHUB_RELEASE]
sources = ['%(namelower)s-v%(version)s.tar.gz']
patches = [
'PyTorch-1.7.0_disable-dev-shm-test.patch',
'PyTorch-1.10.0_fix-kineto-crash.patch',
'PyTorch-1.11.0_fix-fsdp-fp16-test.patch',
'PyTorch-1.11.1_skip-test_init_from_local_shards.patch',
'PyTorch-1.12.1_add-hypothesis-suppression.patch',
'PyTorch-1.12.1_fix-skip-decorators.patch',
'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch',
'PyTorch-1.12.1_fix-test_wishart_log_prob.patch',
'PyTorch-1.12.1_fix-TestTorch.test_to.patch',
'PyTorch-1.12.1_fix-use-after-free-in-tensorpipe-agent.patch',
'PyTorch-1.12.1_fix-vsx-vector-funcs.patch',
'PyTorch-1.12.1_fix-vsx-loadu.patch',
'PyTorch-1.12.1_skip-test_round_robin.patch',
'PyTorch-1.13.1_fix-pytest-args.patch',
'PyTorch-1.13.1_fix-test-ops-conf.patch',
'PyTorch-1.13.1_no-cuda-stubs-rpath.patch',
'PyTorch-1.13.1_remove-flaky-test-in-testnn.patch',
'PyTorch-1.13.1_skip-ao-sparsity-test-without-fbgemm.patch',
'PyTorch-1.13.1_increase-tolerance-test_ops.patch',
'PyTorch-1.13.1_install-vsx-vec-headers.patch',
'PyTorch-1.13.1_skip-failing-grad-test.patch',
]
checksums = [
{'pytorch-v1.13.1.tar.gz': 'dbc229ee9750b02b514937d017744443a269ea0241ed3f32b9af0703589d25d4'},
{'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'},
{'PyTorch-1.10.0_fix-kineto-crash.patch': 'dc467333b28162149af8f675929d8c6bf219f23230bfc0d39af02ba4f6f882eb'},
{'PyTorch-1.11.0_fix-fsdp-fp16-test.patch': 'bb1c4e6d6fd4b0cf57ff8b824c797331b533bb1ffc63f5db0bae3aee10c3dc13'},
{'PyTorch-1.11.1_skip-test_init_from_local_shards.patch':
'4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7'},
{'PyTorch-1.12.1_add-hypothesis-suppression.patch':
'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'},
{'PyTorch-1.12.1_fix-skip-decorators.patch': 'e3ca6e42b2fa592ea095939fb59ab875668a058479407db3f3684cc5c6f4146c'},
{'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch':
'1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83'},
{'PyTorch-1.12.1_fix-test_wishart_log_prob.patch':
'cf475ae6e6234b96c8d1bf917597c5176c94b3ccd940b72f2e1cd0c979580f45'},
{'PyTorch-1.12.1_fix-TestTorch.test_to.patch': '75f27987c3f25c501e719bd2b1c70a029ae0ee28514a97fe447516aee02b1535'},
{'PyTorch-1.12.1_fix-use-after-free-in-tensorpipe-agent.patch':
'0bd7e88b92c4c6f0fecf01746009858ba19f2df68b10b88c41485328a531875d'},
{'PyTorch-1.12.1_fix-vsx-vector-funcs.patch': 'caccbf60f62eac313896c1eaec78b08f5d0fdfcb907079087490bb13d1561aa2'},
{'PyTorch-1.12.1_fix-vsx-loadu.patch': '8bfe3c94ada1dd1f7974a1261a8b576fb7ae944050fa1c7830fca033831123b2'},
{'PyTorch-1.12.1_skip-test_round_robin.patch': '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349'},
{'PyTorch-1.13.1_fix-pytest-args.patch': 'd3e3c841cf8d73683750f29326f2be56ee0bb5df7ff522baf7d7c3f301a91ec2'},
{'PyTorch-1.13.1_fix-test-ops-conf.patch': 'df652eec7753864ebebbfeca546929a53e3fb8f24259d5c9b964266a8551198c'},
{'PyTorch-1.13.1_no-cuda-stubs-rpath.patch': '4c636059850fc9d1ecb27ce275f8aad5d5b6fdc19e35aff0c25b86cb3201352a'},
{'PyTorch-1.13.1_remove-flaky-test-in-testnn.patch':
'be83ff61fe2dedab6d49c232936d5622df81ab49154264490021c6c828e53315'},
{'PyTorch-1.13.1_skip-ao-sparsity-test-without-fbgemm.patch':
'92cd48ef6d01aa7e07ccce1dcaf40bc3fb0f220c4aa4fea15f3e05fb42e37909'},
{'PyTorch-1.13.1_increase-tolerance-test_ops.patch':
'd53e98bf0da7788b68042dcc31bc5708dae962fde3f110cc827eb807a5d08e49'},
{'PyTorch-1.13.1_install-vsx-vec-headers.patch':
'7b678f54bb947afd4767f5877ac424b4b94ce5db609ea20f5a869ccf4027035f'},
{'PyTorch-1.13.1_skip-failing-grad-test.patch': '6681200f9509893cb9231b5c93ac9bc5e6d9d9ae4febefca52e7cbc843ba8f51'},
]

osdependencies = [OS_PKG_IBVERBS_DEV]

builddependencies = [
('CMake', '3.23.1'),
('hypothesis', '6.46.7'),
# For tests
('pytest-rerunfailures', '11.1'),
('pytest-shard', '0.1.2'),
]

dependencies = [
('Ninja', '1.10.2'), # Required for JIT compilation of C++ extensions
('Python', '3.10.4'),
('protobuf', '3.19.4'),
('protobuf-python', '3.19.4'),
('pybind11', '2.9.2'),
('SciPy-bundle', '2022.05'),
('PyYAML', '6.0'),
('MPFR', '4.1.0'),
('GMP', '6.2.1'),
('numactl', '2.0.14'),
('FFmpeg', '4.4.2'),
('Pillow', '9.1.1'),
('expecttest', '0.1.3'),
]

excluded_tests = {
'': [
# This test seems to take too long on NVIDIA Ampere at least.
'distributed/test_distributed_spawn',
# Broken on CUDA 11.6/11.7: https://github.com/pytorch/pytorch/issues/75375
'distributions/test_constraints',
# no xdoctest
'doctests',
boegel marked this conversation as resolved.
Show resolved Hide resolved
# failing on broadwell
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
'test_native_mha',
# intermittent failures on various systems
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
'distributed/rpc/test_tensorpipe_agent',
]
}

runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'

tests = ['PyTorch-check-cpp-extension.py']

moduleclass = 'ai'
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
As we don't set `--save-xml` pytest is called without arguments causing it to try to discover ALL tests.
This leads to massive failures in e.g. `test_ops*` where `--use-pytest` is used by the tests.
See https://github.com/pytorch/pytorch/pull/94589

Author: Alexander Grund (TU Dresden)

diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index e32850908d4..e63c6f2a392 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -737,14 +737,16 @@ def run_tests(argv=UNITTEST_ARGS):
failed |= wait_for_process(p) != 0
assert not failed, "Some test shards have failed"
elif USE_PYTEST:
+ pytest_args = argv
if TEST_SAVE_XML:
test_report_path = get_report_path(pytest=True)
print(f'Test results will be stored in {test_report_path}')
+ pytest_args = pytest_args + [f'--junit-xml-reruns={test_report_path}']

import pytest
os.environ["NO_COLOR"] = "1"
os.environ["USING_PYTEST"] = "1"
- exit_code = pytest.main(args=argv + [f'--junit-xml-reruns={test_report_path}'] if TEST_SAVE_XML else [])
+ exit_code = pytest.main(args=pytest_args)
del os.environ["USING_PYTEST"]
if TEST_SAVE_XML:
sanitize_pytest_xml(test_report_path)
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
From 8581301957b0018a32433f85163535709bc9d332 Mon Sep 17 00:00:00 2001
From: Masaki Kozuki <mkozuki@nvidia.com>
Date: Fri, 7 Oct 2022 21:25:07 -0700
Subject: [PATCH] try using a different group name

ref:
https://github.com/pytorch/pytorch/issues/85923#issuecomment-1272220271

Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>
---
functorch/test/conftest.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/functorch/test/conftest.py b/functorch/test/conftest.py
index d2e929a9a58db..afc39d9f35de9 100644
--- a/functorch/test/conftest.py
+++ b/functorch/test/conftest.py
@@ -17,7 +17,7 @@


def pytest_addoption(parser: Parser) -> None:
- group = parser.getgroup("terminal reporting")
+ group = parser.getgroup("terminal reporting functorch")
group.addoption(
"--junit-xml-reruns",
action="store",
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
`test_out` may fail due to slightly different values caused by different order of matrizes in SGEMM:

> Mismatched elements: 1 / 50 (2.0%)
> Greatest absolute difference: 1.430511474609375e-05 at index (4, 5) (up to 1e-05 allowed)
> Greatest relative difference: 4.65393206065873e-06 at index (4, 5) (up to 1.3e-06 allowed)

Author: Alexander Grund (TU Dresden)
Updated for PyTorch 1.13.1: Simon Branford (University of Birmingham)

--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -545,6 +545,9 @@
else list(supported_dtypes)[0]
)

+ if dtype is torch.float32:
+ self.precision, self.rel_tol = (1.5e-05, 1e-05)
+
samples = op.sample_inputs(device, dtype)
for sample in samples:
# calls it normally to get the expected result
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
Add missing headers to the installation which fixes e.g. test_cpp_extensions_aot_ninja
See https://github.com/pytorch/pytorch/pull/85547

Author: Alexander Grund (TU Dresden)
Updated for PyTorch 1.13.1: Simon Branford (University of Birmingham)

--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@@ -56,7 +56,7 @@
EXCLUDE(ATen_CORE_TEST_SRCS "${ATen_CORE_TEST_SRCS}" ${ATen_CORE_EXCLUDED_TEST_SRCS})
endif()

-file(GLOB base_h "*.h" "detail/*.h" "cpu/*.h" "cpu/vec/vec512/*.h" "cpu/vec/vec256/*.h" "cpu/vec/*.h" "quantized/*.h" "functorch/*.h")
+file(GLOB base_h "*.h" "detail/*.h" "cpu/*.h" "cpu/vec/vec512/*.h" "cpu/vec/vec256/*.h" "cpu/vec/vec256/vsx/*.h" "cpu/vec/*.h" "quantized/*.h" "functorch/*.h")
file(GLOB base_cpp "*.cpp" "detail/*.cpp" "cpu/*.cpp" "functorch/*.cpp")
file(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh")
file(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp")
--- a/setup.py
+++ b/setup.py
@@ -1031,6 +1031,7 @@
'include/ATen/*.h',
'include/ATen/cpu/*.h',
'include/ATen/cpu/vec/vec256/*.h',
+ 'include/ATen/cpu/vec/vec256/vsx/*.h',
'include/ATen/cpu/vec/vec512/*.h',
'include/ATen/cpu/vec/*.h',
'include/ATen/core/*.h',
@@ -1138,6 +1139,7 @@
'include/THH/*.cuh',
'include/THH/*.h*',
'include/THH/generic/*.h',
+ 'include/sleef.h',
'share/cmake/ATen/*.cmake',
'share/cmake/Caffe2/*.cmake',
'share/cmake/Caffe2/public/*.cmake',
Loading