From e30995ac6dd761577eabf7355b6a29396c19911b Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 8 Feb 2021 18:09:48 +0100 Subject: [PATCH 1/4] Create CUDA cache (for JIT compiled PTX code) in build dir instead of $HOME. Add option to control the cache size and to disable it --- easybuild/framework/easyblock.py | 16 +++++++++ easybuild/tools/config.py | 1 + easybuild/tools/options.py | 3 ++ test/framework/easyblock.py | 51 ++++++++++++++++++++++++++++ test/framework/modules.py | 2 +- test/framework/modules/gcccuda/2018a | 26 ++++++++++++++ 6 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 test/framework/modules/gcccuda/2018a diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 776fedc96d..f3645ee78b 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -2150,6 +2150,22 @@ def prepare_step(self, start_dir=True, load_tc_deps_modules=True): self.log.info("Loading extra modules: %s", extra_modules) self.modules_tool.load(extra_modules) + # Setup CUDA cache if required. 
If we don't do this, CUDA will use the $HOME for its cache files + if get_software_root('CUDA') or get_software_root('CUDAcore'): + cuda_cache_maxsize = build_option('cuda_cache_maxsize') + if cuda_cache_maxsize is None: + cuda_cache_maxsize = 1 * 1024 * 1024 * 1024 # 1 GB default value + if cuda_cache_maxsize == 0: + self.log.info('Disabling CUDA PTX cache as per request') + env.setvar('CUDA_CACHE_DISABLE', '1') + else: + cuda_cache_dir = os.path.join(self.builddir, 'eb-cuda-cache') + self.log.info('Enabling CUDA PTX cache of size %s MB at %s', + cuda_cache_maxsize / 1024 / 1024, cuda_cache_dir) + env.setvar('CUDA_CACHE_DISABLE', '0') + env.setvar('CUDA_CACHE_PATH', cuda_cache_dir) + env.setvar('CUDA_CACHE_MAXSIZE', str(cuda_cache_maxsize)) + # guess directory to start configure/build/install process in, and move there if start_dir: self.guess_start_dir() diff --git a/easybuild/tools/config.py b/easybuild/tools/config.py index b97186f3c5..1d686871e7 100644 --- a/easybuild/tools/config.py +++ b/easybuild/tools/config.py @@ -169,6 +169,7 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX): 'container_image_name', 'container_template_recipe', 'container_tmpdir', + 'cuda_cache_maxsize', 'cuda_compute_capabilities', 'download_timeout', 'dump_test_report', diff --git a/easybuild/tools/options.py b/easybuild/tools/options.py index a363cde325..8c7387ebf6 100644 --- a/easybuild/tools/options.py +++ b/easybuild/tools/options.py @@ -357,6 +357,9 @@ def override_options(self): 'consider-archived-easyconfigs': ("Also consider archived easyconfigs", None, 'store_true', False), 'containerize': ("Generate container recipe/image", None, 'store_true', False, 'C'), 'copy-ec': ("Copy specified easyconfig(s) to specified location", None, 'store_true', False), + 'cuda-cache-maxsize': ("Maximum size of the CUDA cache (in bytes) used for JIT compilation of PTX code. 
" + "Leave value empty to let EasyBuild choose a value or '0' to disable the cache", + int, 'store_or_None', None), 'cuda-compute-capabilities': ("List of CUDA compute capabilities to use when building GPU software; " "values should be specified as digits separated by a dot, " "for example: 3.5,5.0,7.2", 'strlist', 'extend', None), diff --git a/test/framework/easyblock.py b/test/framework/easyblock.py index 399a269597..8b8921fcac 100644 --- a/test/framework/easyblock.py +++ b/test/framework/easyblock.py @@ -1761,6 +1761,57 @@ def test_prepare_step_hmns(self): self.assertEqual(len(loaded_modules), 1) self.assertEqual(loaded_modules[0]['mod_name'], 'GCC/6.4.0-2.28') + def test_prepare_step_cuda_cache(self): + """Test handling cuda-cache-maxsize option.""" + + init_config(build_options={'cuda_cache_maxsize': None}) # Automatic mode + + test_ecs = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'easyconfigs', 'test_ecs') + toy_ec = os.path.join(test_ecs, 't', 'toy', 'toy-0.0.eb') + ec = process_easyconfig(toy_ec)[0] + eb = EasyBlock(ec['ec']) + eb.silent = True + eb.make_builddir() + + eb.prepare_step(start_dir=False) + logtxt = read_file(eb.logfile) + self.assertNotIn('Disabling CUDA PTX cache', logtxt) + self.assertNotIn('Enabling CUDA PTX cache', logtxt) + + # Now with CUDA + test_ec = os.path.join(self.test_prefix, 'test.eb') + test_ectxt = re.sub('^toolchain = .*', "toolchain = {'name': 'gcccuda', 'version': '2018a'}", + read_file(toy_ec), flags=re.M) + write_file(test_ec, test_ectxt) + ec = process_easyconfig(test_ec)[0] + eb = EasyBlock(ec['ec']) + eb.silent = True + eb.make_builddir() + + write_file(eb.logfile, '') + eb.prepare_step(start_dir=False) + logtxt = read_file(eb.logfile) + self.assertNotIn('Disabling CUDA PTX cache', logtxt) + self.assertIn('Enabling CUDA PTX cache', logtxt) + self.assertEqual(os.environ['CUDA_CACHE_DISABLE'], '0') + + init_config(build_options={'cuda_cache_maxsize': 0}) # Disable + write_file(eb.logfile, '') + 
eb.prepare_step(start_dir=False) + logtxt = read_file(eb.logfile) + self.assertIn('Disabling CUDA PTX cache', logtxt) + self.assertNotIn('Enabling CUDA PTX cache', logtxt) + self.assertEqual(os.environ['CUDA_CACHE_DISABLE'], '1') + + init_config(build_options={'cuda_cache_maxsize': 1234567890}) # Specified size + write_file(eb.logfile, '') + eb.prepare_step(start_dir=False) + logtxt = read_file(eb.logfile) + self.assertNotIn('Disabling CUDA PTX cache', logtxt) + self.assertIn('Enabling CUDA PTX cache', logtxt) + self.assertEqual(os.environ['CUDA_CACHE_DISABLE'], '0') + self.assertEqual(os.environ['CUDA_CACHE_MAXSIZE'], '1234567890') + def test_checksum_step(self): """Test checksum step""" testdir = os.path.abspath(os.path.dirname(__file__)) diff --git a/test/framework/modules.py b/test/framework/modules.py index f9b6ef6238..b1bea458ae 100644 --- a/test/framework/modules.py +++ b/test/framework/modules.py @@ -54,7 +54,7 @@ # number of modules included for testing purposes -TEST_MODULES_COUNT = 81 +TEST_MODULES_COUNT = 82 class ModulesTest(EnhancedTestCase): diff --git a/test/framework/modules/gcccuda/2018a b/test/framework/modules/gcccuda/2018a new file mode 100644 index 0000000000..f9779f1be5 --- /dev/null +++ b/test/framework/modules/gcccuda/2018a @@ -0,0 +1,26 @@ +#%Module + +proc ModulesHelp { } { + puts stderr { GCC based compiler toolchain with CUDA support, and including + OpenMPI for MPI support, OpenBLAS (BLAS and LAPACK support), FFTW and ScaLAPACK. - Homepage: (none) +} +} + +module-whatis {GNU Compiler Collection (GCC) based compiler toolchain, along with CUDA toolkit. 
- Homepage: (none)} + +set root /prefix/software/gcccuda/2018a + +conflict gcccuda + +if { ![is-loaded GCC/6.4.0-2.28] } { + module load GCC/6.4.0-2.28 +} + +if { ![is-loaded CUDA/9.1.85] } { + module load CUDA/9.1.85 +} + + +setenv EBROOTGCCCUDA "$root" +setenv EBVERSIONGCCCUDA "2018a" +setenv EBDEVELGCCCUDA "$root/easybuild/gcccuda-2018a-easybuild-devel" From 26b0ef014dec7f01c0dc8433065ca4b080d6dcb3 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Tue, 9 Feb 2021 12:18:29 +0100 Subject: [PATCH 2/4] Fix test_prefix_option not being run --- test/framework/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/framework/options.py b/test/framework/options.py index cbdd57cc71..46fa9f70b2 100644 --- a/test/framework/options.py +++ b/test/framework/options.py @@ -4430,7 +4430,7 @@ def test_modules_tool_vs_syntax_check(self): regex = re.compile(pattern, re.M) self.assertTrue(regex.search(stdout), "Pattern '%s' found in: %s" % (regex.pattern, stdout)) - def test_prefix(self): + def test_prefix_option(self): """Test which configuration settings are affected by --prefix.""" txt, _ = self._run_mock_eb(['--show-full-config', '--prefix=%s' % self.test_prefix], raise_error=True) From 2f079a6eccd8ea0afc792abfad0471251e755bcf Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 15 Feb 2021 11:51:44 +0100 Subject: [PATCH 3/4] Change unit of cuda-cache-maxsize to MiB and add cuda_cache_dir --- easybuild/framework/easyblock.py | 32 +++++++++++++++++++------------- easybuild/tools/config.py | 1 + easybuild/tools/options.py | 4 +++- test/framework/easyblock.py | 9 ++++++--- 4 files changed, 29 insertions(+), 17 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index f3645ee78b..6ee0698904 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -1027,6 +1027,24 @@ def make_dir(self, dir_name, clean, dontcreateinstalldir=False): mkdir(dir_name, parents=True) + def 
setup_cuda_cache(self): + cuda_cache_maxsize = build_option('cuda_cache_maxsize') + if cuda_cache_maxsize is None: + cuda_cache_maxsize = 1 * 1024 # 1 GiB default value + else: + cuda_cache_maxsize = int(cuda_cache_maxsize) + if cuda_cache_maxsize == 0: + self.log.info('Disabling CUDA PTX cache as per request') + env.setvar('CUDA_CACHE_DISABLE', '1') + else: + cuda_cache_dir = build_option('cuda_cache_dir') + if not cuda_cache_dir: + cuda_cache_dir = os.path.join(self.builddir, 'eb-cuda-cache') + self.log.info('Enabling CUDA PTX cache of size %s MiB at %s', cuda_cache_maxsize, cuda_cache_dir) + env.setvar('CUDA_CACHE_DISABLE', '0') + env.setvar('CUDA_CACHE_PATH', cuda_cache_dir) + env.setvar('CUDA_CACHE_MAXSIZE', str(cuda_cache_maxsize * 1024 * 1024)) + # # MODULE UTILITY FUNCTIONS # @@ -2152,19 +2170,7 @@ def prepare_step(self, start_dir=True, load_tc_deps_modules=True): # Setup CUDA cache if required. If we don't do this, CUDA will use the $HOME for its cache files if get_software_root('CUDA') or get_software_root('CUDAcore'): - cuda_cache_maxsize = build_option('cuda_cache_maxsize') - if cuda_cache_maxsize is None: - cuda_cache_maxsize = 1 * 1024 * 1024 * 1024 # 1 GB default value - if cuda_cache_maxsize == 0: - self.log.info('Disabling CUDA PTX cache as per request') - env.setvar('CUDA_CACHE_DISABLE', '1') - else: - cuda_cache_dir = os.path.join(self.builddir, 'eb-cuda-cache') - self.log.info('Enabling CUDA PTX cache of size %s MB at %s', - cuda_cache_maxsize / 1024 / 1024, cuda_cache_dir) - env.setvar('CUDA_CACHE_DISABLE', '0') - env.setvar('CUDA_CACHE_PATH', cuda_cache_dir) - env.setvar('CUDA_CACHE_MAXSIZE', str(cuda_cache_maxsize)) + self.setup_cuda_cache() # guess directory to start configure/build/install process in, and move there if start_dir: diff --git a/easybuild/tools/config.py b/easybuild/tools/config.py index 1d686871e7..33efd1b99c 100644 --- a/easybuild/tools/config.py +++ b/easybuild/tools/config.py @@ -169,6 +169,7 @@ def 
mk_full_default_path(name, prefix=DEFAULT_PREFIX): 'container_image_name', 'container_template_recipe', 'container_tmpdir', + 'cuda_cache_dir', 'cuda_cache_maxsize', 'cuda_compute_capabilities', 'download_timeout', diff --git a/easybuild/tools/options.py b/easybuild/tools/options.py index 8c7387ebf6..69441614a6 100644 --- a/easybuild/tools/options.py +++ b/easybuild/tools/options.py @@ -357,7 +357,9 @@ def override_options(self): 'consider-archived-easyconfigs': ("Also consider archived easyconfigs", None, 'store_true', False), 'containerize': ("Generate container recipe/image", None, 'store_true', False, 'C'), 'copy-ec': ("Copy specified easyconfig(s) to specified location", None, 'store_true', False), - 'cuda-cache-maxsize': ("Maximum size of the CUDA cache (in bytes) used for JIT compilation of PTX code. " + 'cuda-cache-dir': ("Path to CUDA cache dir to use if enabled. Defaults to a path inside the build dir.", + str, 'store', None, {'metavar': "PATH"}), + 'cuda-cache-maxsize': ("Maximum size of the CUDA cache (in MiB) used for JIT compilation of PTX code. 
" "Leave value empty to let EasyBuild choose a value or '0' to disable the cache", int, 'store_or_None', None), 'cuda-compute-capabilities': ("List of CUDA compute capabilities to use when building GPU software; " diff --git a/test/framework/easyblock.py b/test/framework/easyblock.py index 8b8921fcac..9257cb22af 100644 --- a/test/framework/easyblock.py +++ b/test/framework/easyblock.py @@ -1762,7 +1762,7 @@ def test_prepare_step_hmns(self): self.assertEqual(loaded_modules[0]['mod_name'], 'GCC/6.4.0-2.28') def test_prepare_step_cuda_cache(self): - """Test handling cuda-cache-maxsize option.""" + """Test handling cuda-cache-* options.""" init_config(build_options={'cuda_cache_maxsize': None}) # Automatic mode @@ -1803,14 +1803,17 @@ def test_prepare_step_cuda_cache(self): self.assertNotIn('Enabling CUDA PTX cache', logtxt) self.assertEqual(os.environ['CUDA_CACHE_DISABLE'], '1') - init_config(build_options={'cuda_cache_maxsize': 1234567890}) # Specified size + # Specified size and location + cuda_cache_dir = os.path.join(self.test_prefix, 'custom-cuda-cache') + init_config(build_options={'cuda_cache_maxsize': 1234, 'cuda_cache_dir': cuda_cache_dir}) write_file(eb.logfile, '') eb.prepare_step(start_dir=False) logtxt = read_file(eb.logfile) self.assertNotIn('Disabling CUDA PTX cache', logtxt) self.assertIn('Enabling CUDA PTX cache', logtxt) self.assertEqual(os.environ['CUDA_CACHE_DISABLE'], '0') - self.assertEqual(os.environ['CUDA_CACHE_MAXSIZE'], '1234567890') + self.assertEqual(os.environ['CUDA_CACHE_MAXSIZE'], str(1234 * 1024 * 1024)) + self.assertEqual(os.environ['CUDA_CACHE_PATH'], cuda_cache_dir) def test_checksum_step(self): """Test checksum step""" From 01bcca03b55d5e44740405aa7bdd853c2b5571ce Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Mon, 5 Apr 2021 18:42:47 +0200 Subject: [PATCH 4/4] rename setup_cuda_cache method to set_up_cuda_cache + minor tweaks to log messages --- easybuild/framework/easyblock.py | 11 +++++++---- 1 file changed, 7 insertions(+), 
4 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 28add7d1ec..7b952dc2c7 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -1034,20 +1034,23 @@ def make_dir(self, dir_name, clean, dontcreateinstalldir=False): mkdir(dir_name, parents=True) - def setup_cuda_cache(self): + def set_up_cuda_cache(self): + """Set up CUDA PTX cache.""" + cuda_cache_maxsize = build_option('cuda_cache_maxsize') if cuda_cache_maxsize is None: cuda_cache_maxsize = 1 * 1024 # 1 GiB default value else: cuda_cache_maxsize = int(cuda_cache_maxsize) + if cuda_cache_maxsize == 0: - self.log.info('Disabling CUDA PTX cache as per request') + self.log.info("Disabling CUDA PTX cache since cache size was set to zero") env.setvar('CUDA_CACHE_DISABLE', '1') else: cuda_cache_dir = build_option('cuda_cache_dir') if not cuda_cache_dir: cuda_cache_dir = os.path.join(self.builddir, 'eb-cuda-cache') - self.log.info('Enabling CUDA PTX cache of size %s MiB at %s', cuda_cache_maxsize, cuda_cache_dir) + self.log.info("Enabling CUDA PTX cache of size %s MiB at %s", cuda_cache_maxsize, cuda_cache_dir) env.setvar('CUDA_CACHE_DISABLE', '0') env.setvar('CUDA_CACHE_PATH', cuda_cache_dir) env.setvar('CUDA_CACHE_MAXSIZE', str(cuda_cache_maxsize * 1024 * 1024)) @@ -2183,7 +2186,7 @@ def prepare_step(self, start_dir=True, load_tc_deps_modules=True): # Setup CUDA cache if required. If we don't do this, CUDA will use the $HOME for its cache files if get_software_root('CUDA') or get_software_root('CUDAcore'): - self.setup_cuda_cache() + self.set_up_cuda_cache() # guess directory to start configure/build/install process in, and move there if start_dir: