Skip to content

Commit

Permalink
Merge pull request #3569 from Flamefire/cuda_cache
Browse files Browse the repository at this point in the history
create CUDA cache (for JIT compiled PTX code) in build dir instead of $HOME
  • Loading branch information
boegel authored Apr 5, 2021
2 parents aded444 + 01bcca0 commit aea60d7
Show file tree
Hide file tree
Showing 7 changed files with 114 additions and 2 deletions.
25 changes: 25 additions & 0 deletions easybuild/framework/easyblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -1034,6 +1034,27 @@ def make_dir(self, dir_name, clean, dontcreateinstalldir=False):

mkdir(dir_name, parents=True)

def set_up_cuda_cache(self):
    """
    Configure the CUDA cache for JIT-compiled PTX code through environment variables.

    The maximum cache size (in MiB) is read from the 'cuda_cache_maxsize' build option;
    1024 MiB is used when that option is None, and a size of zero disables the cache.
    The cache location is read from the 'cuda_cache_dir' build option, falling back to
    an 'eb-cuda-cache' subdirectory of the build directory when it is not set.
    """
    maxsize_mib = build_option('cuda_cache_maxsize')
    # no explicit size requested: use 1 GiB (expressed in MiB) as default value
    maxsize_mib = 1 * 1024 if maxsize_mib is None else int(maxsize_mib)

    if maxsize_mib == 0:
        self.log.info("Disabling CUDA PTX cache since cache size was set to zero")
        env.setvar('CUDA_CACHE_DISABLE', '1')
        return

    # only fall back to a location inside the build dir when no cache dir was configured
    cache_path = build_option('cuda_cache_dir') or os.path.join(self.builddir, 'eb-cuda-cache')
    self.log.info("Enabling CUDA PTX cache of size %s MiB at %s", maxsize_mib, cache_path)

    env.setvar('CUDA_CACHE_DISABLE', '0')
    env.setvar('CUDA_CACHE_PATH', cache_path)
    # size is configured in MiB, the environment variable is set in bytes
    maxsize_bytes = maxsize_mib * 1024 * 1024
    env.setvar('CUDA_CACHE_MAXSIZE', str(maxsize_bytes))

#
# MODULE UTILITY FUNCTIONS
#
Expand Down Expand Up @@ -2163,6 +2184,10 @@ def prepare_step(self, start_dir=True, load_tc_deps_modules=True):
self.log.info("Loading extra modules: %s", extra_modules)
self.modules_tool.load(extra_modules)

# Setup CUDA cache if required. If we don't do this, CUDA will use the $HOME for its cache files
if get_software_root('CUDA') or get_software_root('CUDAcore'):
self.set_up_cuda_cache()

# guess directory to start configure/build/install process in, and move there
if start_dir:
self.guess_start_dir()
Expand Down
2 changes: 2 additions & 0 deletions easybuild/tools/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,8 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX):
'container_image_name',
'container_template_recipe',
'container_tmpdir',
'cuda_cache_dir',
'cuda_cache_maxsize',
'cuda_compute_capabilities',
'download_timeout',
'dump_test_report',
Expand Down
5 changes: 5 additions & 0 deletions easybuild/tools/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,11 @@ def override_options(self):
'consider-archived-easyconfigs': ("Also consider archived easyconfigs", None, 'store_true', False),
'containerize': ("Generate container recipe/image", None, 'store_true', False, 'C'),
'copy-ec': ("Copy specified easyconfig(s) to specified location", None, 'store_true', False),
'cuda-cache-dir': ("Path to CUDA cache dir to use if enabled. Defaults to a path inside the build dir.",
str, 'store', None, {'metavar': "PATH"}),
'cuda-cache-maxsize': ("Maximum size of the CUDA cache (in MiB) used for JIT compilation of PTX code. "
"Leave value empty to let EasyBuild choose a value or '0' to disable the cache",
int, 'store_or_None', None),
'cuda-compute-capabilities': ("List of CUDA compute capabilities to use when building GPU software; "
"values should be specified as digits separated by a dot, "
"for example: 3.5,5.0,7.2", 'strlist', 'extend', None),
Expand Down
54 changes: 54 additions & 0 deletions test/framework/easyblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -1894,6 +1894,60 @@ def test_prepare_step_hmns(self):
self.assertEqual(len(loaded_modules), 1)
self.assertEqual(loaded_modules[0]['mod_name'], 'GCC/6.4.0-2.28')

def test_prepare_step_cuda_cache(self):
    """Check that prepare_step honours the cuda-cache-* build options."""

    disabling_msg = 'Disabling CUDA PTX cache'
    enabling_msg = 'Enabling CUDA PTX cache'

    def new_easyblock(ec_path):
        """Return a silent EasyBlock (with build dir created) for the given easyconfig file."""
        parsed_ec = process_easyconfig(ec_path)[0]
        easyblock = EasyBlock(parsed_ec['ec'])
        easyblock.silent = True
        easyblock.make_builddir()
        return easyblock

    def fresh_prepare_log(easyblock):
        """Empty the log file, run prepare_step and return the resulting log text."""
        write_file(easyblock.logfile, '')
        easyblock.prepare_step(start_dir=False)
        return read_file(easyblock.logfile)

    init_config(build_options={'cuda_cache_maxsize': None})  # Automatic mode

    framework_dir = os.path.abspath(os.path.dirname(__file__))
    test_ecs = os.path.join(framework_dir, 'easyconfigs', 'test_ecs')
    toy_ec = os.path.join(test_ecs, 't', 'toy', 'toy-0.0.eb')

    # toolchain without CUDA: the cache must not be touched at all
    eb = new_easyblock(toy_ec)
    eb.prepare_step(start_dir=False)
    logtxt = read_file(eb.logfile)
    for msg in (disabling_msg, enabling_msg):
        self.assertNotIn(msg, logtxt)

    # Now with CUDA
    cuda_ectxt = re.sub('^toolchain = .*', "toolchain = {'name': 'gcccuda', 'version': '2018a'}",
                        read_file(toy_ec), flags=re.M)
    test_ec = os.path.join(self.test_prefix, 'test.eb')
    write_file(test_ec, cuda_ectxt)
    eb = new_easyblock(test_ec)

    logtxt = fresh_prepare_log(eb)
    self.assertNotIn(disabling_msg, logtxt)
    self.assertIn(enabling_msg, logtxt)
    self.assertEqual(os.environ['CUDA_CACHE_DISABLE'], '0')

    # a maximum size of zero disables the cache
    init_config(build_options={'cuda_cache_maxsize': 0})  # Disable
    logtxt = fresh_prepare_log(eb)
    self.assertIn(disabling_msg, logtxt)
    self.assertNotIn(enabling_msg, logtxt)
    self.assertEqual(os.environ['CUDA_CACHE_DISABLE'], '1')

    # Specified size and location
    cuda_cache_dir = os.path.join(self.test_prefix, 'custom-cuda-cache')
    init_config(build_options={'cuda_cache_maxsize': 1234, 'cuda_cache_dir': cuda_cache_dir})
    logtxt = fresh_prepare_log(eb)
    self.assertNotIn(disabling_msg, logtxt)
    self.assertIn(enabling_msg, logtxt)
    self.assertEqual(os.environ['CUDA_CACHE_DISABLE'], '0')
    self.assertEqual(os.environ['CUDA_CACHE_MAXSIZE'], str(1234 * 1024 * 1024))
    self.assertEqual(os.environ['CUDA_CACHE_PATH'], cuda_cache_dir)

def test_checksum_step(self):
"""Test checksum step"""
testdir = os.path.abspath(os.path.dirname(__file__))
Expand Down
2 changes: 1 addition & 1 deletion test/framework/modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@


# number of modules included for testing purposes
TEST_MODULES_COUNT = 81
TEST_MODULES_COUNT = 82


class ModulesTest(EnhancedTestCase):
Expand Down
26 changes: 26 additions & 0 deletions test/framework/modules/gcccuda/2018a
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#%Module
# Modulefile for the gcccuda/2018a toolchain; the magic cookie above must remain the very first line.

# Help text written to stderr by the ModulesHelp procedure.
# NOTE(review): the text mentions OpenMPI, OpenBLAS, FFTW and ScaLAPACK, but this file only
# loads GCC and CUDA below; presumably copied from another toolchain module, confirm.
proc ModulesHelp { } {
puts stderr { GCC based compiler toolchain with CUDA support, and including
OpenMPI for MPI support, OpenBLAS (BLAS and LAPACK support), FFTW and ScaLAPACK. - Homepage: (none)
}
}

# One-line description of the module.
module-whatis {GNU Compiler Collection (GCC) based compiler toolchain, along with CUDA toolkit. - Homepage: (none)}

# Installation prefix referenced by the EBROOT and EBDEVEL variables set at the end of this file.
set root /prefix/software/gcccuda/2018a

# Declare a conflict with any other gcccuda module.
conflict gcccuda

# Load the toolchain components, each only when it is not loaded yet.
if { ![is-loaded GCC/6.4.0-2.28] } {
module load GCC/6.4.0-2.28
}

if { ![is-loaded CUDA/9.1.85] } {
module load CUDA/9.1.85
}


# EB* environment variables: installation root, version and devel module path of this toolchain.
setenv EBROOTGCCCUDA "$root"
setenv EBVERSIONGCCCUDA "2018a"
setenv EBDEVELGCCCUDA "$root/easybuild/gcccuda-2018a-easybuild-devel"
2 changes: 1 addition & 1 deletion test/framework/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -4666,7 +4666,7 @@ def test_modules_tool_vs_syntax_check(self):
regex = re.compile(pattern, re.M)
self.assertTrue(regex.search(stdout), "Pattern '%s' found in: %s" % (regex.pattern, stdout))

def test_prefix(self):
def test_prefix_option(self):
"""Test which configuration settings are affected by --prefix."""
txt, _ = self._run_mock_eb(['--show-full-config', '--prefix=%s' % self.test_prefix], raise_error=True)

Expand Down

0 comments on commit aea60d7

Please sign in to comment.