Critical bug fix for slurm prolog should be added to master for new release too. #425

Merged · 3 commits · Jun 29, 2021
12 changes: 6 additions & 6 deletions group_vars/talos_cluster/vars.yml
@@ -35,23 +35,23 @@ additional_etc_hosts:
vcompute_hostnames: "{{ stack_prefix }}-vcompute[01-03]"
vcompute_sockets: 4
vcompute_cores_per_socket: 1
- vcompute_real_memory: 7821
+ vcompute_real_memory: 7820
vcompute_max_cpus_per_node: "{{ vcompute_sockets * vcompute_cores_per_socket - 2 }}"
vcompute_max_mem_per_node: "{{ vcompute_real_memory - vcompute_sockets * vcompute_cores_per_socket * 512 }}"
vcompute_local_disk: 270000
vcompute_features: 'tmp08'
vcompute_ethernet_interfaces:
-   - 'eth0'
-   - 'eth1'
+   - 'vlan983'
+   - 'vlan985.isilon'
ui_hostnames: "{{ slurm_cluster_name }}"
ui_sockets: 4
ui_cores_per_socket: 1
- ui_real_memory: 7821
+ ui_real_memory: 7820
ui_local_disk: 0
ui_features: 'prm08,tmp08'
ui_ethernet_interfaces:
-   - 'eth0'
-   - 'eth1'
+   - 'vlan983'
+   - 'vlan985.isilon'
ssh_host_signer_ca_private_key: "{{ ssh_host_signer_ca_keypair_dir }}/umcg-hpc-development-ca"
use_ldap: yes
create_ldap: no
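For reference, with the post-change values above (4 sockets, 1 core per socket, 7820 MB real memory) the derived limits evaluate to 2 schedulable CPUs and 5772 MB schedulable memory per node. A minimal shell sketch of that arithmetic (illustrative only, not part of this PR):

# Illustrative only: evaluates the Jinja2 expressions from vars.yml by hand.
vcompute_sockets=4
vcompute_cores_per_socket=1
vcompute_real_memory=7820   # MB, lowered from 7821 in this PR
# vcompute_max_cpus_per_node = sockets * cores_per_socket - 2
echo $(( vcompute_sockets * vcompute_cores_per_socket - 2 ))                           # 2
# vcompute_max_mem_per_node = real_memory - sockets * cores_per_socket * 512
echo $(( vcompute_real_memory - vcompute_sockets * vcompute_cores_per_socket * 512 ))  # 5772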
21 changes: 1 addition & 20 deletions roles/slurm_management/files/slurm.prolog
@@ -3,9 +3,6 @@
if [[ -z "${SLURM_JOB_ID}" ]]; then
logger -s "FATAL: SLURM_JOB_ID is empty or unset in SLURM prolog."
exit 1
- elif [[ -z "${SLURM_JOB_QOS}" ]]; then
-     logger -s "FATAL: SLURM_JOB_QOS is empty or unset in SLURM prolog."
-     exit 1
#else
# logger -s "DEBUG: Found SLURM_JOB_ID ${SLURM_JOB_ID} and SLURM_JOB_QOS ${SLURM_JOB_QOS} in SLURM prolog."
fi
@@ -18,23 +15,7 @@ LOCAL_SCRATCH_DIR='/local'
# Check if local scratch dir is mountpoint and hence not a dir on the system disk.
#
if [[ $(stat -c '%d' "${LOCAL_SCRATCH_DIR}") -eq $(stat -c '%d' "${LOCAL_SCRATCH_DIR}/..") ]]; then
if [[ "${SLURM_JOB_QOS}" =~ ^ds.* ]]; then
#
# For the data staging QoS "ds", which executes jobs only on the UI,
# a dedicated tmp dir per job may be absent as not all UIs have a /local mount.
#
logger -s "WARN: local scratch disk (${LOCAL_SCRATCH_DIR}) is not mounted."
else
#
# Make sure we can create tmp dirs in /local on compute nodes.
# When this fails the job must not continue as SLURM will default to /tmp,
# which is not suitable for heavy random IO nor large data sets.
# Hammering /tmp may effectively result in the node going down.
# When the prolog fails the node will be set to state=DRAIN instead.
#
logger -s "FATAL: local scratch disk (${LOCAL_SCRATCH_DIR}) is not mounted."
exit 1
fi
logger -s "WARN: local scratch disk (${LOCAL_SCRATCH_DIR}) for Slurm jobs is not mounted/available."
else
#
# Create dedicated tmp dir for this job.
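Net effect of the prolog change above: the hard requirement on SLURM_JOB_QOS is gone, and a missing /local mount is now logged as a warning for any job instead of failing the prolog (which would have set the node to state=DRAIN). A rough sketch of the resulting flow, assuming it is combined with the per-job tmp dir creation that sits below the shown diff context (the mkdir/chown lines are illustrative, not the verbatim file):

#!/bin/bash
# Illustrative sketch of the prolog flow after this PR; not the verbatim file.
if [[ -z "${SLURM_JOB_ID}" ]]; then
    logger -s "FATAL: SLURM_JOB_ID is empty or unset in SLURM prolog."
    exit 1
fi

LOCAL_SCRATCH_DIR='/local'
#
# A directory is a mount point when its device number differs from its parent's.
#
if [[ $(stat -c '%d' "${LOCAL_SCRATCH_DIR}") -eq $(stat -c '%d' "${LOCAL_SCRATCH_DIR}/..") ]]; then
    # Same device as the parent: /local is only a dir on the system disk.
    # No longer fatal and no longer QoS dependent: warn and let the job continue.
    logger -s "WARN: local scratch disk (${LOCAL_SCRATCH_DIR}) for Slurm jobs is not mounted/available."
else
    # Create a dedicated tmp dir for this job (exact commands live outside the shown diff).
    mkdir -m 700 -p "${LOCAL_SCRATCH_DIR}/${SLURM_JOB_ID}"
    chown "${SLURM_JOB_UID}:${SLURM_JOB_GID}" "${LOCAL_SCRATCH_DIR}/${SLURM_JOB_ID}"
fi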