From ecdef00f820525c7ea4abb209aa104b5b1e9ff78 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Tue, 2 Feb 2021 18:07:17 +0100 Subject: [PATCH 1/8] Add support for including PTX code in PyTorch This adds PTX code to PyTorch by default for any newer architecture than the last selected one. This can be changed by the new EC option "ptx" --- easybuild/easyblocks/p/pytorch.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index f06b26550e..165e0c6645 100644 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -51,7 +51,9 @@ def extra_options(): extra_vars.update({ 'excluded_tests': [{}, 'Mapping of architecture strings to list of tests to be excluded', CUSTOM], 'custom_opts': [[], 'List of options for the build/install command. Can be used to change the defaults ' + - 'set by the PyTorch EasyBlock, for example ["USE_MKLDNN=0"].', CUSTOM] + 'set by the PyTorch EasyBlock, for example ["USE_MKLDNN=0"].', CUSTOM], + 'ptx': ['latest', 'For which compute architectures PTX code should be generated. Can be ' + '"first", "latest", None or any PyTorch supported architecture, e.g. 
"3.7"', CUSTOM], }) extra_vars['download_dep_fail'][0] = True extra_vars['sanity_pip_check'][0] = True @@ -193,6 +195,16 @@ def configure_step(self): raise EasyBuildError('List of CUDA compute capabilities must be specified, either via ' 'cuda_compute_capabilities easyconfig parameter or via ' '--cuda-compute-capabilities') + ptx = self.cfg['ptx'] + if ptx == 'latest': + cuda_cc[-1] += '+PTX' + elif ptx == 'first': + cuda_cc[0] += '+PTX' + elif ptx is not None: + if ptx in cuda_cc: + cuda_cc.remove(ptx) + cuda_cc.append(ptx + '+PTX') + self.log.info('Compiling with specified list of CUDA compute capabilities: %s', ', '.join(cuda_cc)) options.append('TORCH_CUDA_ARCH_LIST="%s"' % ';'.join(cuda_cc)) else: From 3f374ab8f44e863cc0b61c8fc65a764736c5dabf Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Fri, 5 Feb 2021 09:25:16 +0100 Subject: [PATCH 2/8] Make sure TORCH_CUDA_ARCH_LIST is also set for tests --- easybuild/easyblocks/p/pytorch.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index 165e0c6645..443140a61b 100644 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -53,7 +53,7 @@ def extra_options(): 'custom_opts': [[], 'List of options for the build/install command. Can be used to change the defaults ' + 'set by the PyTorch EasyBlock, for example ["USE_MKLDNN=0"].', CUSTOM], 'ptx': ['latest', 'For which compute architectures PTX code should be generated. Can be ' - '"first", "latest", None or any PyTorch supported architecture, e.g. "3.7"', CUSTOM], + '"first", "latest", None or any PyTorch supported arch, e.g. 
"3.7"', CUSTOM], }) extra_vars['download_dep_fail'][0] = True extra_vars['sanity_pip_check'][0] = True @@ -206,7 +206,10 @@ def configure_step(self): cuda_cc.append(ptx + '+PTX') self.log.info('Compiling with specified list of CUDA compute capabilities: %s', ', '.join(cuda_cc)) - options.append('TORCH_CUDA_ARCH_LIST="%s"' % ';'.join(cuda_cc)) + # This variable is also used at runtime (e.g. for tests) and if it is not set PyTorch will automatically + # determine the compute capability of a GPU in the system and use that which may fail tests if + # it is too new for the used nvcc + env.setvar('TORCH_CUDA_ARCH_LIST', ';'.join(cuda_cc)) else: # Disable CUDA options.append('USE_CUDA=0') From afb5fa93ba55e1e0b4186e2c5ad6a41bd931a79a Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 8 Feb 2021 09:37:33 +0100 Subject: [PATCH 3/8] Replace latest by last --- easybuild/easyblocks/p/pytorch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index 443140a61b..658c0da0a5 100644 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -52,8 +52,8 @@ def extra_options(): 'excluded_tests': [{}, 'Mapping of architecture strings to list of tests to be excluded', CUSTOM], 'custom_opts': [[], 'List of options for the build/install command. Can be used to change the defaults ' + 'set by the PyTorch EasyBlock, for example ["USE_MKLDNN=0"].', CUSTOM], - 'ptx': ['latest', 'For which compute architectures PTX code should be generated. Can be ' - '"first", "latest", None or any PyTorch supported arch, e.g. 
"3.7"', CUSTOM], }) extra_vars['download_dep_fail'][0] = True extra_vars['sanity_pip_check'][0] = True @@ -196,7 +196,7 @@ def configure_step(self): 'cuda_compute_capabilities easyconfig parameter or via ' '--cuda-compute-capabilities') ptx = self.cfg['ptx'] - if ptx == 'latest': + if ptx == 'last': cuda_cc[-1] += '+PTX' elif ptx == 'first': cuda_cc[0] += '+PTX' From 9b9cc1ae4c196b51f1cb4c86b2f60997d3b7b2fd Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 8 Feb 2021 11:30:58 +0100 Subject: [PATCH 4/8] Add option to setup cuda cache --- easybuild/easyblocks/p/pytorch.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index 658c0da0a5..472397f372 100644 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -54,6 +54,12 @@ def extra_options(): 'set by the PyTorch EasyBlock, for example ["USE_MKLDNN=0"].', CUSTOM], 'ptx': ['last', 'For which compute architectures PTX code should be generated. Can be ' '"first", "last", None or any PyTorch supported arch, e.g. "3.7"', CUSTOM], + 'cuda_cache_maxsize': [ + None, + 'Maximum size of the cache (in bytes) used by CUDA for JIT compilation of PTX code. 
' + 'Use "None" to let EasyBuild choose a value or "0" to disable the cache', + CUSTOM + ], }) extra_vars['download_dep_fail'][0] = True extra_vars['sanity_pip_check'][0] = True @@ -135,6 +141,17 @@ def prepare_step(self, *args, **kwargs): symlink(os.path.join(cmake_root, 'bin', 'cmake'), os.path.join(cmake_bin_dir, 'cmake3')) path = "%s:%s" % (cmake_bin_dir, os.getenv('PATH')) env.setvar('PATH', path) + if get_software_root('CUDA'): + cuda_cache_maxsize = self.cfg['cuda_cache_maxsize'] + if cuda_cache_maxsize is None: + cuda_cache_maxsize = 1 * 1024 * 1024 # 1 GB default value + if cuda_cache_maxsize == 0: + env.setvar('CUDA_CACHE_DISABLE', '1') + else: + cuda_cache_dir = tempfile.mkdtemp(suffix='-cuda_cache', dir=self.builddir) + env.setvar('CUDA_CACHE_DISABLE', '0') + env.setvar('CUDA_CACHE_PATH', cuda_cache_dir) + env.setvar('CUDA_CACHE_MAXSIZE', cuda_cache_maxsize) def configure_step(self): """Custom configure procedure for PyTorch.""" From e14e7e9694c04e78530b465137505e33f7c10864 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 8 Feb 2021 11:35:48 +0100 Subject: [PATCH 5/8] Actually use 1 GB default cache size --- easybuild/easyblocks/p/pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index 472397f372..c550fc31fd 100644 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -144,7 +144,7 @@ def prepare_step(self, *args, **kwargs): if get_software_root('CUDA'): cuda_cache_maxsize = self.cfg['cuda_cache_maxsize'] if cuda_cache_maxsize is None: - cuda_cache_maxsize = 1 * 1024 * 1024 # 1 GB default value + cuda_cache_maxsize = 1 * 1024 * 1024 * 1024 # 1 GB default value if cuda_cache_maxsize == 0: env.setvar('CUDA_CACHE_DISABLE', '1') else: From 8ee89663fe93e9f8996c510678a557b911fb8a36 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 8 Feb 2021 13:38:05 +0100 Subject: [PATCH 6/8] Env var must be a string --- 
easybuild/easyblocks/p/pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index c550fc31fd..bc4426d9a1 100644 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -151,7 +151,7 @@ def prepare_step(self, *args, **kwargs): cuda_cache_dir = tempfile.mkdtemp(suffix='-cuda_cache', dir=self.builddir) env.setvar('CUDA_CACHE_DISABLE', '0') env.setvar('CUDA_CACHE_PATH', cuda_cache_dir) - env.setvar('CUDA_CACHE_MAXSIZE', cuda_cache_maxsize) + env.setvar('CUDA_CACHE_MAXSIZE', str(cuda_cache_maxsize)) def configure_step(self): """Custom configure procedure for PyTorch.""" From 50304269e5e8cabdfc1852a6a88c3699ce64bfa3 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Tue, 9 Feb 2021 12:26:16 +0100 Subject: [PATCH 7/8] Move CUDA cache setting to framework --- easybuild/easyblocks/p/pytorch.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index bc4426d9a1..658c0da0a5 100644 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -54,12 +54,6 @@ def extra_options(): 'set by the PyTorch EasyBlock, for example ["USE_MKLDNN=0"].', CUSTOM], 'ptx': ['last', 'For which compute architectures PTX code should be generated. Can be ' '"first", "last", None or any PyTorch supported arch, e.g. "3.7"', CUSTOM], - 'cuda_cache_maxsize': [ - None, - 'Maximum size of the cache (in bytes) used by CUDA for JIT compilation of PTX code. 
' - 'Use "None" to let EasyBuild choose a value or "0" to disable the cache', - CUSTOM - ], }) extra_vars['download_dep_fail'][0] = True extra_vars['sanity_pip_check'][0] = True @@ -141,17 +135,6 @@ def prepare_step(self, *args, **kwargs): symlink(os.path.join(cmake_root, 'bin', 'cmake'), os.path.join(cmake_bin_dir, 'cmake3')) path = "%s:%s" % (cmake_bin_dir, os.getenv('PATH')) env.setvar('PATH', path) - if get_software_root('CUDA'): - cuda_cache_maxsize = self.cfg['cuda_cache_maxsize'] - if cuda_cache_maxsize is None: - cuda_cache_maxsize = 1 * 1024 * 1024 * 1024 # 1 GB default value - if cuda_cache_maxsize == 0: - env.setvar('CUDA_CACHE_DISABLE', '1') - else: - cuda_cache_dir = tempfile.mkdtemp(suffix='-cuda_cache', dir=self.builddir) - env.setvar('CUDA_CACHE_DISABLE', '0') - env.setvar('CUDA_CACHE_PATH', cuda_cache_dir) - env.setvar('CUDA_CACHE_MAXSIZE', str(cuda_cache_maxsize)) def configure_step(self): """Custom configure procedure for PyTorch.""" From 9735206ae5aa94aff8a2ae886e99a2e8e76df029 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Thu, 11 Feb 2021 09:00:46 +0100 Subject: [PATCH 8/8] Don't modify original CCC list --- easybuild/easyblocks/p/pytorch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index 658c0da0a5..321fcc7602 100644 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -196,6 +196,7 @@ def configure_step(self): 'cuda_compute_capabilities easyconfig parameter or via ' '--cuda-compute-capabilities') ptx = self.cfg['ptx'] + cuda_cc = cuda_cc[:] # Don't modify original list if ptx == 'last': cuda_cc[-1] += '+PTX' elif ptx == 'first':