diff --git a/group_vars/hyperchicken_cluster/vars.yml b/group_vars/hyperchicken_cluster/vars.yml
index e454cc95a..4288f9b65 100644
--- a/group_vars/hyperchicken_cluster/vars.yml
+++ b/group_vars/hyperchicken_cluster/vars.yml
@@ -3,7 +3,7 @@ slurm_cluster_name: 'hyperchicken'
 stack_domain: ''
 stack_name: "{{ slurm_cluster_name }}_cluster" # stack_name must match the name of the folder that contains this vars.yml file.
 stack_prefix: 'hc'
-slurm_version: '20.11.8-1.el7.umcg'
+slurm_version: '22.05.2-1.el7.umcg'
 slurm_partitions:
   - name: regular # Must be in sync with group listed in Ansible inventory.
     default: yes
diff --git a/roles/slurm_client/tasks/main.yml b/roles/slurm_client/tasks/main.yml
index 9e9d86c1b..a3a147a72 100644
--- a/roles/slurm_client/tasks/main.yml
+++ b/roles/slurm_client/tasks/main.yml
@@ -46,14 +46,23 @@
     - 'restart_slurmd'
   become: true
 
+- name: 'Delete deprecated/unused Slurm packages.'
+  ansible.builtin.yum:
+    state: 'removed'
+    name:
+      - slurm-libpmi
+      - slurm-openlava
+      - slurm-torque
+  become: true
+
 - name: 'Install the Slurm client with yum.'
   ansible.builtin.yum:
     state: 'installed'
     update_cache: true
     allow_downgrade: true
     name:
-      - "slurm*{{ slurm_version }}"
-      - "slurm-slurmd*{{ slurm_version }}"
+      - "slurm-{{ slurm_version }}"
+      - "slurm-slurmd-{{ slurm_version }}"
   notify:
     - 'restart_munge'
     - 'restart_slurmd'
@@ -64,7 +73,7 @@
   ansible.builtin.lineinfile:
     path: "/usr/lib/systemd/system/{{ item }}.service"
     regexp: '^#?PIDFile='
-    line: "#PIDFile=/var/run/slurm/{{ item }}.pid"
+    state: absent
     owner: root
     group: root
     mode: '0644'
@@ -145,7 +154,7 @@
 
 - name: 'Deploy slurm.conf.'
   ansible.builtin.template:
-    src: 'roles/slurm_management/templates/slurm.conf'
+    src: "roles/slurm_management/templates/slurm.conf.{{ slurm_version }}"
     dest: '/etc/slurm/slurm.conf'
     owner: 'root'
     group: 'root'
@@ -156,7 +165,7 @@
 
 - name: 'Configure cgroups.'
   ansible.builtin.copy:
-    src: 'roles/slurm_management/files/cgroup.conf'
+    src: "roles/slurm_management/files/cgroup.conf.{{ slurm_version }}"
    dest: '/etc/slurm/cgroup.conf'
     owner: 'root'
     group: 'root'
diff --git a/roles/slurm_management/files/cgroup.conf b/roles/slurm_management/files/cgroup.conf.20.11.8-1.el7.umcg
similarity index 100%
rename from roles/slurm_management/files/cgroup.conf
rename to roles/slurm_management/files/cgroup.conf.20.11.8-1.el7.umcg
diff --git a/roles/slurm_management/files/cgroup.conf.22.05.2-1.el7.umcg b/roles/slurm_management/files/cgroup.conf.22.05.2-1.el7.umcg
new file mode 100644
index 000000000..ee61c25b8
--- /dev/null
+++ b/roles/slurm_management/files/cgroup.conf.22.05.2-1.el7.umcg
@@ -0,0 +1,42 @@
+#######################################################
+#
+# Slurm cgroup support configuration file
+#
+# See man slurm.conf and man cgroup.conf for further
+# information on cgroup configuration parameters
+#
+# Bind job tasks to a subset of the allocated cores using sched_setaffinity
+# (which is enabled by TaskPlugin task/affinity in slurm.conf)
+# to prevent them from swapping to other cores during job execution,
+# which would decrease performance.
+# Requires the Portable Hardware Locality (hwloc) library.
+#
+# NOTE: Due to bugs in hwloc:
+# It is recommended to stack task/affinity,task/cgroup together when configuring TaskPlugin in slurm.conf
+# combined with setting ConstrainCores=yes in cgroup.conf.
+# This setup uses the task/affinity plugin for setting the affinity of the tasks
+# (which is better and different than task/cgroup)
+# and uses the task/cgroup plugin to fence tasks into the specified resources,
+# thus combining the best of both pieces.
+#######################################################
+
+CgroupAutomount=yes
+ConstrainCores=yes
+ConstrainDevices=yes
+ConstrainRAMSpace=yes
+ConstrainSWAPSpace=yes
+#
+# Lustre / GPFS / NFS clients or daemons tend to use large buffers allocated in kernel memory space.
+# Those buffers count as kernel memory used by the cgroup of a job.
+# Unaware users might have their jobs exceed this kernel memory limit simply by reading large files.
+# The "plain RAM" limit is not affected by this option.
+# Note: we limit the total amount of RAM that can be assigned to Slurm jobs to be less than the system total
+# in the slurm.conf file to make sure the OS, file system daemons, etc. have enough RAM available.
+#
+ConstrainKmemSpace=no
+
+#
+# Don't let Slurm jobs swap as swapping kills performance taking the H out of HPC.
+#
+AllowedSwapSpace=0
+MemorySwappiness=0
diff --git a/roles/slurm_management/tasks/main.yml b/roles/slurm_management/tasks/main.yml
index e11a06d16..5cf5f12a2 100644
--- a/roles/slurm_management/tasks/main.yml
+++ b/roles/slurm_management/tasks/main.yml
@@ -89,10 +89,10 @@
     update_cache: true
     allow_downgrade: true
     name:
-      - "slurm*{{ slurm_version }}"
-      - "slurm-slurmctld*{{ slurm_version }}"
-      - "slurm-slurmdbd*{{ slurm_version }}"
-      - "slurm-perlapi*{{ slurm_version }}"
+      - "slurm-{{ slurm_version }}"
+      - "slurm-slurmctld-{{ slurm_version }}"
+      - "slurm-slurmdbd-{{ slurm_version }}"
+      - "slurm-perlapi-{{ slurm_version }}"
   notify:
     - 'restart_munge'
     - 'restart_slurmdbd'
@@ -156,7 +156,7 @@
 
 - name: 'Install Slurm config file.'
   ansible.builtin.template:
-    src: 'templates/slurm.conf'
+    src: "templates/slurm.conf.{{ slurm_version }}"
     dest: '/etc/slurm/slurm.conf'
     owner: 'root'
     group: 'root'
diff --git a/roles/slurm_management/templates/slurm.conf b/roles/slurm_management/templates/slurm.conf.20.11.8-1.el7.umcg
similarity index 100%
rename from roles/slurm_management/templates/slurm.conf
rename to roles/slurm_management/templates/slurm.conf.20.11.8-1.el7.umcg
diff --git a/roles/slurm_management/templates/slurm.conf.updated_for_slurm-21.08.x b/roles/slurm_management/templates/slurm.conf.22.05.2-1.el7.umcg
similarity index 100%
rename from roles/slurm_management/templates/slurm.conf.updated_for_slurm-21.08.x
rename to roles/slurm_management/templates/slurm.conf.22.05.2-1.el7.umcg