Skip to content

Commit

Permalink
merge with main
Browse files Browse the repository at this point in the history
  • Loading branch information
vsc46128 vscuser committed Dec 11, 2024
2 parents cb3d24c + 061bfa1 commit 5cb8d53
Show file tree
Hide file tree
Showing 23 changed files with 329 additions and 199 deletions.
1 change: 1 addition & 0 deletions config/aws_mc.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@
# All should _at least_ have this amount (30GB * 1E9 / (1024*1024) = 28610 MiB)
'mem_per_node': 28610
},
'max_jobs': 1,
}
for system in site_configuration['systems']:
for partition in system['partitions']:
Expand Down
7 changes: 7 additions & 0 deletions config/azure_mc.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@
'access': ['--partition=x86-64-amd-zen4-node', '--export=NONE'],
'descr': 'Zen4, 16 cores, 30 GB',
'prepare_cmds': [
# Avoid
# https://www.eessi.io/docs/known_issues/eessi-2023.06/#eessi-production-repository-v202306
# NOTE: every command must end with a comma. Adjacent string literals without a
# separating comma are implicitly concatenated by Python, which would fuse these
# exports into a single malformed shell command.
'export OMPI_MCA_btl=^uct,ofi',
'export OMPI_MCA_pml=ucx',
'export OMPI_MCA_mtl=^ofi',
# Use override to avoid fallback to zen3
'export EESSI_SOFTWARE_SUBDIR_OVERRIDE=x86_64/amd/zen4',
common_eessi_init(),
# Required when using srun as launcher with --export=NONE in partition access,
Expand Down Expand Up @@ -100,6 +106,7 @@
'options': ['--mem={size}'],
}
],
'max_jobs': 1,
}
for system in site_configuration['systems']:
for partition in system['partitions']:
Expand Down
89 changes: 45 additions & 44 deletions config/izum_vega.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,50 +68,51 @@
},
'descr': 'CPU partition Standard, see https://en-doc.vega.izum.si/architecture/'
},
# {
# 'name': 'gpu',
# 'scheduler': 'slurm',
# 'prepare_cmds': [
# common_eessi_init(),
# # Pass job environment variables like $PATH, etc., into job steps
# 'export SLURM_EXPORT_ENV=ALL',
# # Needed when using srun launcher
# # 'export SLURM_MPI_TYPE=pmix', # WARNING: this broke the GROMACS on Vega
# # Avoid https://github.com/EESSI/software-layer/issues/136
# # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1)
# 'export OMPI_MCA_pml=ucx',
# ],
# 'launcher': 'mpirun',
# # Use --export=None to avoid that login environment is passed down to submitted jobs
# 'access': ['-p gpu', '--export=None'],
# 'environs': ['default'],
# 'max_jobs': 60,
# 'devices': [
# {
# 'type': DEVICE_TYPES[GPU],
# 'num_devices': 4,
# }
# ],
# 'resources': [
# {
# 'name': '_rfm_gpu',
# 'options': ['--gpus-per-node={num_gpus_per_node}'],
# },
# {
# 'name': 'memory',
# 'options': ['--mem={size}'],
# }
# ],
# 'features': [
# FEATURES[GPU],
# ] + list(SCALES.keys()),
# 'extras': {
# # Make sure to round down, otherwise a job might ask for more mem than is available
# # per node
# 'mem_per_node': 476.837 * 1024 # in MiB (should be checked, its unclear from slurm.conf)
# },
# 'descr': 'GPU partition, see https://en-doc.vega.izum.si/architecture/'
# },
{
'name': 'gpu',
'scheduler': 'slurm',
'prepare_cmds': [
common_eessi_init(),
# Pass job environment variables like $PATH, etc., into job steps
'export SLURM_EXPORT_ENV=ALL',
# Needed when using srun launcher
# 'export SLURM_MPI_TYPE=pmix', # WARNING: this broke the GROMACS on Vega
# Avoid https://github.com/EESSI/software-layer/issues/136
# Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1)
'export OMPI_MCA_pml=ucx',
],
'launcher': 'mpirun',
# Use --export=None to avoid that login environment is passed down to submitted jobs
'access': ['-p gpu', '--export=None'],
'environs': ['default'],
'max_jobs': 60,
'devices': [
{
'type': DEVICE_TYPES[GPU],
'num_devices': 4,
}
],
'resources': [
{
'name': '_rfm_gpu',
'options': ['--gpus-per-node={num_gpus_per_node}'],
},
{
'name': 'memory',
'options': ['--mem={size}'],
}
],
'features': [
FEATURES[GPU],
] + list(SCALES.keys()),
'extras': {
GPU_VENDOR: GPU_VENDORS[NVIDIA],
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
'mem_per_node': 476.837 * 1024 # in MiB (should be checked, its unclear from slurm.conf)
},
'descr': 'GPU partition, see https://en-doc.vega.izum.si/architecture/'
},
]
},
],
Expand Down
57 changes: 20 additions & 37 deletions config/vsc_hortense.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,26 @@
# ReFrame configuration file for VSC Tier-1 Hortense
# https://docs.vscentrum.be/en/latest/gent/tier1_hortense.html
#
# authors: Samuel Moors (VUB-HPC), Kenneth Hoste (HPC-UGent)
import os
# authors: Samuel Moors (VUB-HPC), Kenneth Hoste (HPC-UGent), Lara Peeters (HPC-UGent)

# Use the topology file generated by ReFrame for CPU partitions.
# Autodetection cannot be used until the new `sched_options` functionality is part of
# a ReFrame release, see https://github.com/reframe-hpc/reframe/issues/2970

# Instructions on generating topology file
# ```
# module swap cluster/{partition}
# qsub -I -l nodes=1:ppn=all -l walltime=00:30:00
#
# python3 -m venv "$TMPDIR"/reframe_venv
# source "$TMPDIR"/reframe_venv/bin/activate
# python3 -m pip install --upgrade pip
# python3 -m pip install reframe-hpc=="4.6.2"
#
# mkdir -p ~/.reframe/topology/hortense-{partition_name}
# reframe --detect-host-topology \
# ~/.reframe/topology/hortense-{partition_name}/processor.json
# ```

from reframe.core.backends import register_launcher
from reframe.core.launchers import JobLauncher
Expand Down Expand Up @@ -50,13 +68,6 @@ def command(self, job):
'max_jobs': 20,
'launcher': launcher,
'modules': [mpi_module],
'processor': {
'num_cpus': 128,
'num_sockets': 2,
'num_cpus_per_socket': 64,
'num_cpus_per_core': 1,
'arch': 'zen2',
},
'resources': [
{
'name': 'memory',
Expand All @@ -82,13 +93,6 @@ def command(self, job):
'max_jobs': 20,
'launcher': launcher,
'modules': [mpi_module],
'processor': {
'num_cpus': 128,
'num_sockets': 2,
'num_cpus_per_socket': 64,
'num_cpus_per_core': 1,
'arch': 'zen2',
},
'resources': [
{
'name': 'memory',
Expand All @@ -114,13 +118,6 @@ def command(self, job):
'max_jobs': 20,
'launcher': launcher,
'modules': [mpi_module],
'processor': {
'num_cpus': 128,
'num_sockets': 2,
'num_cpus_per_socket': 64,
'num_cpus_per_core': 1,
'arch': 'zen3',
},
'resources': [
{
'name': 'memory',
Expand All @@ -146,13 +143,6 @@ def command(self, job):
'max_jobs': 20,
'launcher': launcher,
'modules': [mpi_module],
'processor': {
'num_cpus': 48,
'num_sockets': 2,
'num_cpus_per_socket': 24,
'num_cpus_per_core': 1,
'arch': 'zen2',
},
'features': [
FEATURES[GPU],
] + list(SCALES.keys()),
Expand Down Expand Up @@ -190,13 +180,6 @@ def command(self, job):
'max_jobs': 20,
'launcher': launcher,
'modules': [mpi_module],
'processor': {
'num_cpus': 48,
'num_sockets': 2,
'num_cpus_per_socket': 24,
'num_cpus_per_core': 1,
'arch': 'zen2',
},
'features': [
FEATURES[GPU],
] + list(SCALES.keys()),
Expand Down
75 changes: 66 additions & 9 deletions eessi/testsuite/eessi_mixin.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
from reframe.core.builtins import parameter, run_after
from reframe.core.builtins import parameter, run_after, variable
from reframe.core.exceptions import ReframeFatalError
from reframe.core.pipeline import RegressionMixin
from reframe.utility.sanity import make_performance_function
import reframe.utility.sanity as sn

from eessi.testsuite import hooks
from eessi.testsuite.constants import DEVICE_TYPES, SCALES, COMPUTE_UNIT
from eessi.testsuite.constants import DEVICE_TYPES, SCALES, COMPUTE_UNIT, TAGS
from eessi.testsuite.utils import log
from eessi.testsuite import __version__ as testsuite_version


# Hooks from the Mixin class seem to be executed _before_ those of the child class
Expand All @@ -25,19 +28,29 @@ class EESSI_Mixin(RegressionMixin):
All EESSI tests should derive from this mixin class unless they have a very good reason not to.
To run correctly, tests inheriting from this class need to define variables and parameters that are used here.
That definition needs to be done 'on time', i.e. early enough in the execution of the ReFrame pipeline.
Here, we list which class attributes need to be defined by the child class, and by (the end of) what phase:
Here, we list which class attributes must be defined by the child class, and by (the end of) what phase:
- Init phase: device_type, scale, module_name
- Init phase: device_type, scale, module_name, bench_name (if bench_name_ci is set)
- Setup phase: compute_unit, required_mem_per_node
The child class may also overwrite the following attributes:
- Init phase: time_limit, measure_memory_usage
- Init phase: time_limit, measure_memory_usage, bench_name_ci
"""

# Set defaults for these class variables, can be overwritten by child class if desired
measure_memory_usage = False
measure_memory_usage = variable(bool, value=False)
scale = parameter(SCALES.keys())
bench_name = None
bench_name_ci = None

# Create ReFrame variables for logging runtime environment information
cvmfs_repo_name = variable(str, value='None')
cvmfs_software_subdir = variable(str, value='None')
full_modulepath = variable(str, value='None')

# Make sure the version of the EESSI test suite gets logged in the ReFrame report
eessi_testsuite_version = variable(str, value=testsuite_version)

# Note that the error for an empty parameter is a bit unclear for ReFrame 4.6.2, but that will hopefully improve
# see https://github.com/reframe-hpc/reframe/issues/3254
Expand Down Expand Up @@ -81,7 +94,7 @@ def validate_init(self):
for var in var_list:
if not hasattr(self, var):
msg = "The variable '%s' should be defined in any test class that inherits" % var
msg += " from EESSI_Mixin in the init phase (or earlier), but it wasn't"
msg += " from EESSI_Mixin before (or in) the init phase, but it wasn't"
raise ReframeFatalError(msg)

# Check that the value for these variables is valid,
Expand Down Expand Up @@ -113,21 +126,32 @@ def measure_mem_usage(self):
# instead of the @performance_function decorator
self.perf_variables['memory'] = make_performance_function(hooks.extract_memory_usage, 'MiB', self)

@run_after('init', always_last=True)
def set_tag_ci(self):
    """Add the CI tag when bench_name equals bench_name_ci.

    Nothing happens when bench_name_ci is unset (falsy).

    Raises:
        ReframeFatalError: if bench_name_ci is set but bench_name is not.
    """
    # Guard: without a CI benchmark name there is nothing to tag
    if not self.bench_name_ci:
        return

    # A CI benchmark name is only meaningful if the test also names its own benchmark
    if not self.bench_name:
        raise ReframeFatalError("Attribute bench_name_ci is set, but bench_name is not set")

    if self.bench_name == self.bench_name_ci:
        self.tags.add(TAGS['CI'])
        log(f'tags set to {self.tags}')

@run_after('setup')
def validate_setup(self):
"""Check that all variables that have to be set for subsequent hooks in the setup phase have been set"""
var_list = ['compute_unit']
for var in var_list:
if not hasattr(self, var):
msg = "The variable '%s' should be defined in any test class that inherits" % var
msg += " from EESSI_Mixin in the setup phase (or earlier), but it wasn't"
msg += " from EESSI_Mixin before (or in) the setup phase, but it wasn't"
raise ReframeFatalError(msg)

# Check that required_mem_per_node was defined to compute the required memory per node as function of the
# number of tasks per node
if not hasattr(self, 'required_mem_per_node'):
msg = "The function 'required_mem_per_node' should be defined in any test class that inherits"
msg += " from EESSI_Mixin in the setup phase (or earlier), but it wasn't. Note that this function"
msg += " from EESSI_Mixin before (or in) the setup phase, but it wasn't. Note that this function"
msg += " can use self.num_tasks_per_node, as it will be called after that attribute"
msg += " has been set."
raise ReframeFatalError(msg)
Expand All @@ -151,3 +175,36 @@ def assign_tasks_per_compute_unit(self):
def request_mem(self):
    """Request the memory this test needs per node through the memory hook."""
    # required_mem_per_node is supplied by the child test class
    mem_needed = self.required_mem_per_node()
    hooks.req_memory_per_node(self, app_mem_req=mem_needed)

@run_after('setup')
def log_runtime_info(self):
    """Queue post-run commands that echo runtime information.

    The appended commands print the values of $EESSI_CVMFS_REPO and $EESSI_SOFTWARE_SUBDIR and,
    when module_name is set, the output of `module --location show` for that module. Each line
    is prefixed with a fixed label.
    """
    for echo_cmd in (
        'echo "EESSI_CVMFS_REPO: $EESSI_CVMFS_REPO"',
        'echo "EESSI_SOFTWARE_SUBDIR: $EESSI_SOFTWARE_SUBDIR"',
    ):
        self.postrun_cmds.append(echo_cmd)

    # Without a module name there is no modulefile location to report
    if not self.module_name:
        return

    # Get full modulepath
    self.postrun_cmds.append(f'echo "FULL_MODULEPATH: $(module --location show {self.module_name})"')

@run_after('run')
def extract_runtime_info_from_log(self):
    """Read the labelled runtime info lines from the job output and store them on the test.

    For each label (EESSI_CVMFS_REPO, EESSI_SOFTWARE_SUBDIR, FULL_MODULEPATH) the file at
    <stagedir>/<stdout> is searched; if anything is found, the formatted result is assigned to
    cvmfs_repo_name, cvmfs_software_subdir and full_modulepath respectively. Skipped entirely
    on a dry run.
    """
    if self.is_dry_run():
        return

    # (regex, named group to extract, attribute that receives the result)
    extraction_specs = (
        (r'EESSI_CVMFS_REPO: /cvmfs/(?P<repo>.*)$', 'repo', 'cvmfs_repo_name'),
        (r'EESSI_SOFTWARE_SUBDIR: (?P<subdir>.*)$', 'subdir', 'cvmfs_software_subdir'),
        (r'FULL_MODULEPATH: (?P<modpath>.*)$', 'modpath', 'full_modulepath'),
    )
    for pattern, group, attr_name in extraction_specs:
        matches = sn.extractall(pattern, f'{self.stagedir}/{self.stdout}', group, str)
        # Leave the attribute at its current value when nothing was found
        if matches:
            # NOTE(review): this stores the string formatting of the whole extractall result,
            # which presumably is a list of matches rather than a single value -- confirm
            setattr(self, attr_name, f'{matches}')
Empty file.
Empty file.
Empty file.
Loading

0 comments on commit 5cb8d53

Please sign in to comment.