diff --git a/group_vars/hyperchicken-cluster/secrets.yml b/group_vars/hyperchicken-cluster/secrets.yml
index 6b72a27f5..0a83228a4 100644
--- a/group_vars/hyperchicken-cluster/secrets.yml
+++ b/group_vars/hyperchicken-cluster/secrets.yml
@@ -1,19 +1,24 @@
 $ANSIBLE_VAULT;1.1;AES256
-65663166613837656436313139396532613838346234303835366338623938623737636435623030
-3438356330643166383735633363623965383233356336330a353233663163353661626564643338
-61653138373230343832383139386637643432376231343237613835363731373837353363636439
-6137646633303661370a653762326165343039346237353964383165323339653535333762643830
-61633863356234383233393630353065363130353333343532373238306538356435643264343938
-36396366303765373862343338363534343763336534626363633763386130613833353961346535
-34353539313066666463623961353134616333326538366235333831316565346266313933376466
-32313533623162353535633964346630336266636162323864656530343131343663303339646339
-36623866663661666533363861663033373439643634363136343032343436396637373965316666
-35343236616335313164396463363461636338633030363837616231303230393138303531343739
-61643238356537646634343563323066396636323339396538313338346666386130636537343435
-31626133306530306566323137323361653065356437613937643830386330636361653935613961
-33373461363361316534353266613165373066633963363837366332326234663830353835646337
-36646263323234303261663861623139316131663763616134326263656236356165383333663564
-32306562366235386431623232336165653135376364323365353636373932323330306136656563
-39383038393133616236366137663035303863333836626462363836343538363438653633666264
-66643130313061633930303437353166653130356235356439373766313336363539376233323733
-3932323864356263353166353465623931666438323631303065
+31393061613237626164653430373033373161326436303631656664303331306432393731623235
+3434666363393461383632633061643438333731353537380a303035663830336232656533376138
+62393332393737316265373937666539306532303334636463316437353431343035663634663365
+6233313561383161640a313431386262373064656131396163336633616435316136316461383564
+63663237313861316665633130366163393139373238373965333139373439393261303233613030
+36633430333732303939316236326637353038666239333236353633643365353665383666323032
+33323137653538376166343464656162663062366637333136383662613764346363643063346164
+65373135643938636662613337376531353937363665326466396464393961386162646532306631
+38393838346536613631363138333832653830396538306634633138613936393430343431323662
+64393730646435653064343830653836333863623133313462633435386165373033306635656536
+34363230396262613765346438396633373761356666306661326565646330663331636165653161
+62303534326535393966306338316538333338643464343731663766656333623463393632646433
+36333364646531333337353932663061656433383631353265363263326461333631613032643533
+63636665643236343136633435623864363562303833633338376630633237343531343837303437
+33663663356337333230343633346535633138356432613234353335623933353735383031323363
+39363661383837323637636665643530393765353061356133396531656261323631343761393761
+39623664373934623565613165356534626362643266323535303566613430363539353064393933
+37396336636261363530316261616237373533313766326166313030373838626436303737636337
+64373031633966306639643136616264626664303934663066373062366537363534386232386161
+61613737363837336132323337376362376266356661396536633939623834356162333161666538
+65613639343961386465666637653530623833643634363437336333323131363034616436623661
+63393863626638366537383064393463616163376130643137353365663963616566336139343630
+626539393862656366663266643631393765
diff --git a/group_vars/hyperchicken-cluster/vars.yml b/group_vars/hyperchicken-cluster/vars.yml
index 07ca48ba9..20e642bd0 100644
--- a/group_vars/hyperchicken-cluster/vars.yml
+++ b/group_vars/hyperchicken-cluster/vars.yml
@@ -6,20 +6,20 @@ mailhub: '192.168.0.5'
 rewrite_domain: "{{ stack_prefix }}-sai.{{ slurm_cluster_domain }}"
 motd: "To solve or not to solve, that's the question."
 vcompute_hostnames: "{{ stack_prefix }}-vcompute[01-05]"
-vcompute_sockets: 1
-vcompute_cores_per_socket: 9
-vcompute_real_memory: 20000
+vcompute_sockets: 16
+vcompute_cores_per_socket: 1
+vcompute_real_memory: 64000
 vcompute_max_cpus_per_node: "{{ vcompute_sockets * vcompute_cores_per_socket - 2 }}"
 vcompute_max_mem_per_node: "{{ vcompute_real_memory - vcompute_sockets * vcompute_cores_per_socket * 512 }}"
 vcompute_local_disk: 0
 vcompute_features: 'tmp07'
 ui_hostnames: "{{ slurm_cluster_name }}"
-ui_sockets: 1
+ui_sockets: 4
 ui_cores_per_socket: 1
-ui_real_memory: 3000
+ui_real_memory: 7800
 ui_local_disk: 0
 ui_features: 'prm07,tmp07'
-ssh_host_signer_ca_private_key: "{{ ssh_host_signer_ca_keypair_dir }}/umcg-hpc-ca"
+ssh_host_signer_ca_private_key: "{{ ssh_host_signer_ca_keypair_dir }}/umcg-hpc-development-ca"
 key_name: Gerben
 image_cirros: cirros-0.3.4-x86_64-disk.img
 image_centos7: centos7
@@ -69,4 +69,24 @@ local_regular_users:
     groups: ['users', 'depad']
   - user: 'rkanninga'
     groups: ['users', 'depad']
+pfs_mounts: [
+  { pfs: 'Solve-RD',
+    source: 'em-isi-3126.ebi.ac.uk:/ifs/Solve-RD',
+    type: 'nfs4',
+    rw_options: 'defaults,_netdev,vers=4.0,noatime,nodiratime',
+    ro_options: 'defaults,_netdev,vers=4.0,noatime,nodiratime,ro' },
+]
+lfs_mounts: [
+  { lfs: 'home',
+    pfs: 'Solve-RD' },
+  { lfs: 'groups/GROUP/tmp01',
+    pfs: 'Solve-RD',
+    groups: ['umcg-atd', 'Solve-RD'] },
+  { lfs: 'groups/GROUP/prm01',
+    pfs: 'Solve-RD',
+    groups: ['umcg-atd', 'Solve-RD'] },
+  { lfs: 'env08',
+    pfs: 'Solve-RD',
+    machines: "{{ groups['compute-vm'] + groups['user-interface'] }}" },
+]
 ...
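A quick worked example of what the new compute sizing above means for the derived Slurm limits (for reference only, not part of the patch; assuming the memory values are in MB, as Slurm's RealMemory expects). The unchanged Jinja2 expressions evaluate to:

    vcompute_max_cpus_per_node: 16 * 1 - 2           = 14      # two cores presumably kept free for the OS and daemons
    vcompute_max_mem_per_node:  64000 - 16 * 1 * 512 = 55808   # MB, i.e. 512 MB reserved per core

The analogous ui_* values (4 sockets x 1 core per socket, 7800 MB) feed the new user-interface NHC template further down.
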
diff --git a/roles/slurm-client/tasks/main.yml b/roles/slurm-client/tasks/main.yml
index 01fa54fef..4991cba67 100644
--- a/roles/slurm-client/tasks/main.yml
+++ b/roles/slurm-client/tasks/main.yml
@@ -101,13 +101,23 @@
     mode: 0600
     dest: /etc/munge/munge.key
 
-- name: Deploy nhc.conf
+- name: Deploy UI nhc.conf
   template:
-    src: templates/nhc.conf
+    src: templates/user-interface_nhc.conf
     dest: /etc/nhc/nhc.conf
     owner: root
     group: root
     mode: 0644
+  when: inventory_hostname in groups['user-interface']
+
+- name: Deploy compute-vm nhc.conf
+  template:
+    src: templates/compute-vm_nhc.conf
+    dest: /etc/nhc/nhc.conf
+    owner: root
+    group: root
+    mode: 0644
+  when: inventory_hostname in groups['compute-vm']
 
 - name: Start slurm and munge services
   systemd:
diff --git a/roles/slurm-client/templates/compute-vm_nhc.conf b/roles/slurm-client/templates/compute-vm_nhc.conf
new file mode 100644
index 000000000..aecb8a4ac
--- /dev/null
+++ b/roles/slurm-client/templates/compute-vm_nhc.conf
@@ -0,0 +1,150 @@
+# NHC Configuration File
+#
+# Lines are in the form "<hostmask>||<check>"
+# Hostmask is a glob, /regexp/, or {noderange}
+# Comments begin with '#'
+#
+
+
+#######################################################################
+###
+### NHC Configuration Variables
+###
+# Explicitly instruct NHC to assume Slurm is the Resource Manager
+ * || export NHC_RM=slurm
+
+# Do not mark nodes offline
+# * || export MARK_OFFLINE=0
+
+# Activate debugging mode
+# * || export DEBUG=1
+
+# Set watchdog timer to 15 seconds
+# * || export TIMEOUT=15
+
+# In out-of-band contexts, enable all checks
+# * || export NHC_CHECK_ALL=1
+
+# Run df only for local file systems of type ext4.
+# This prevents running df for large shared file systems resulting in
+# "NHC: Watchdog timer unable to terminate hung NHC process" errors.
+ * || export DFI_FLAGS='-Tiltext4'
+ * || export DF_FLAGS='-Tkltext4'
+
+# Use short hostname instead of FQDN to mark nodes online/offline.
+# This prevents the
+# "Not sure how to handle node state "" on ..."
+# error when machines have FQDN hostnames, but are listed with short names in the SLURM config.
+ * || HOSTNAME="$HOSTNAME_S"
+
+#######################################################################
+###
+### Hardware checks
+###
+# Set these to your correct socket, core, and thread counts.
+# * || check_hw_cpuinfo {{ vcompute_sockets }} {{ vcompute_sockets * vcompute_cores_per_socket }} {{ vcompute_sockets * vcompute_cores_per_socket }}
+
+# Set these to the amount of physical RAM you have (leave the fudge factor).
+# * || check_hw_physmem {{ vcompute_real_memory }} {{ vcompute_real_memory }} 15%
+
+# Check specifically for free physical memory.
+# * || check_hw_physmem_free 1MB
+
+# Check for some sort of free memory of either type.
+# * || check_hw_mem_free 2GB
+
+# Checks for an active ethernet interface named "eth0."
+# * || check_hw_eth eth0
+
+# Checks for an active ethernet interface named "eth1."
+# * || check_hw_eth eth1
+
+# Checks for an active ethernet interface named "eth2."
+# * || check_hw_eth eth2
+
+# Check the mcelog daemon for any pending errors.
+# * || check_hw_mcelog
+
+
+#######################################################################
+###
+### Filesystem checks
+###
+# All nodes should have their root filesystem mounted read/write.
+ * || check_fs_mount_rw -f /
+
+# All nodes should have their /local filesystem mounted read/write.
+ * || check_fs_mount_rw -f /local
+
+# Controlling TTYs are a good thing!
+ * || check_fs_mount_rw -t devpts -s '/(none|devpts)/' -f /dev/pts
+
+# Make sure the root filesystem doesn't get too full.
+ * || check_fs_free / 10%
+
+# Make sure the root filesystem has enough free inodes.
+ * || check_fs_ifree / 1k
+
+# Make sure the /var volume doesn't get too full.
+ * || check_fs_free /var 5%
+
+# Make sure the /var filesystem has enough free inodes.
+ * || check_fs_ifree /var 1k
+
+# The following illustrates how to assert an NFSv3 mount (or any other specific mount option).
+# * || check_fs_mount -s bluearc0:/home -t nfs -o '/(^|,)vers=3(,|$)/' -f /home
+#* || check_fs_mount -s gcc-storage001.stor.hpc.local:/ifs/rekencluster/umcgst10/home -t nfs -o '/(^|,)vers=4(,|$)/' -f /home
+#* || check_fs_mount -s gcc-storage001.stor.hpc.local:/ifs/rekencluster/tmp01 -t nfs -o '/(^|,)vers=4(,|$)/' -f /mnt/tmp01
+#* || check_fs_mount -s gcc-storage001.stor.hpc.local:/ifs/rekencluster/umcgst10/.envsync/tmp01 /apps -t nfs -o '/(^|,)vers=4(,|$)/' -f /apps
+
+# All nodes should have their home filesystem mounted read/write
+# and not on the local system disk, but from a shared storage system instead.
+ * || check_fs_mount_rw -f /home -t '/(gpfs|lustre|nfs)/'
+
+# All nodes should have their apps filesystem mounted read only
+# and not on the local system disk, but from a shared storage system instead.
+ * || check_fs_mount_ro -f /apps -t '/(gpfs|lustre|nfs)/'
+
+# All nodes should have their tmp filesystems mounted read/write.
+# We do not check the mounts for each and every group, because the list of groups changes regularly.
+# Instead we check only the Analysis Team Development (atd) group:
+# * || check_fs_mount_rw -f /groups/umcg-atd/tmp01 -t '/(gpfs|lustre|nfs)/'
+
+
+#######################################################################
+###
+### File/metadata checks
+###
+# These should always be directories and always be read/write/execute and sticky.
+ * || check_file_test -r -w -x -d -k /tmp /var/tmp
+
+# These should always be readable and should never be empty.
+ * || check_file_test -r -s /etc/passwd /etc/group
+
+# Assert common properties for /dev/null (which occasionally gets clobbered).
+ * || check_file_test -c -r -w /dev/null /dev/zero
+ * || check_file_stat -m 0666 -u 0 -g 0 -t 1 -T 3 /dev/null
+
+# Make sure there's relatively recent activity from the syslog.
+ * || check_file_stat -n 7200 /var/log/messages
+
+# Validate a couple important accounts in the passwd file.
+ * || check_file_contents /etc/passwd "/^root:x:0:0:/" "sshd:*"
+
+
+#######################################################################
+###
+### Process checks
+###
+# Everybody needs sshd running, right? But don't use -r (restart)!
+ * || check_ps_service -u root -S sshd
+
+# The cron daemon is another useful critter...
+ * || check_ps_service -r crond
+
+# This is only valid for RHEL6 and similar/newer systems.
+ * || check_ps_service -d rsyslogd -r rsyslog
+
+# Double your core count is a good rule of thumb for load average max.
+# This should work if you place it after one of the check_hw_*() checks.
+ * || check_ps_loadavg $((2*HW_CORES))
diff --git a/roles/slurm-client/templates/user-interface_nhc.conf b/roles/slurm-client/templates/user-interface_nhc.conf
new file mode 100644
index 000000000..ebf8e3e9a
--- /dev/null
+++ b/roles/slurm-client/templates/user-interface_nhc.conf
@@ -0,0 +1,150 @@
+# NHC Configuration File
+#
+# Lines are in the form "<hostmask>||<check>"
+# Hostmask is a glob, /regexp/, or {noderange}
+# Comments begin with '#'
+#
+
+
+#######################################################################
+###
+### NHC Configuration Variables
+###
+# Explicitly instruct NHC to assume Slurm is the Resource Manager
+ * || export NHC_RM=slurm
+
+# Do not mark nodes offline
+# * || export MARK_OFFLINE=0
+
+# Activate debugging mode
+# * || export DEBUG=1
+
+# Set watchdog timer to 15 seconds
+# * || export TIMEOUT=15
+
+# In out-of-band contexts, enable all checks
+# * || export NHC_CHECK_ALL=1
+
+# Run df only for local file systems of type ext4.
+# This prevents running df for large shared file systems resulting in
+# "NHC: Watchdog timer unable to terminate hung NHC process" errors.
+ * || export DFI_FLAGS='-Tiltext4'
+ * || export DF_FLAGS='-Tkltext4'
+
+# Use short hostname instead of FQDN to mark nodes online/offline.
+# This prevents the
+# "Not sure how to handle node state "" on ..."
+# error when machines have FQDN hostnames, but are listed with short names in the SLURM config.
+ * || HOSTNAME="$HOSTNAME_S"
+
+#######################################################################
+###
+### Hardware checks
+###
+# Set these to your correct socket, core, and thread counts.
+ * || check_hw_cpuinfo {{ ui_sockets }} {{ ui_sockets * ui_cores_per_socket }} {{ ui_sockets * ui_cores_per_socket }}
+
+# Set these to the amount of physical RAM you have (leave the fudge factor).
+ * || check_hw_physmem {{ ui_real_memory }}MB {{ ui_real_memory }}MB 5%
+
+# Check specifically for free physical memory.
+ * || check_hw_physmem_free 1MB
+
+# Check for some sort of free memory of either type.
+ * || check_hw_mem_free 2GB
+
+# Checks for an active ethernet interface named "eth0."
+ * || check_hw_eth eth0
+
+# Checks for an active ethernet interface named "eth1."
+ * || check_hw_eth eth1
+
+# Checks for an active ethernet interface named "eth2."
+# * || check_hw_eth eth2
+
+# Check the mcelog daemon for any pending errors.
+ * || check_hw_mcelog
+
+
+#######################################################################
+###
+### Filesystem checks
+###
+# All nodes should have their root filesystem mounted read/write.
+ * || check_fs_mount_rw -f /
+
+# All nodes should have their /local filesystem mounted read/write.
+# * || check_fs_mount_rw -f /local
+
+# Controlling TTYs are a good thing!
+ * || check_fs_mount_rw -t devpts -s '/(none|devpts)/' -f /dev/pts
+
+# Make sure the root filesystem doesn't get too full.
+ * || check_fs_free / 10%
+
+# Make sure the root filesystem has enough free inodes.
+ * || check_fs_ifree / 1k
+
+# Make sure the /var volume doesn't get too full.
+ * || check_fs_free /var 5%
+
+# Make sure the /var filesystem has enough free inodes.
+ * || check_fs_ifree /var 1k
+
+# The following illustrates how to assert an NFSv3 mount (or any other specific mount option).
+# * || check_fs_mount -s bluearc0:/home -t nfs -o '/(^|,)vers=3(,|$)/' -f /home
+#* || check_fs_mount -s gcc-storage001.stor.hpc.local:/ifs/rekencluster/umcgst10/home -t nfs -o '/(^|,)vers=4(,|$)/' -f /home
+#* || check_fs_mount -s gcc-storage001.stor.hpc.local:/ifs/rekencluster/tmp01 -t nfs -o '/(^|,)vers=4(,|$)/' -f /mnt/tmp01
+#* || check_fs_mount -s gcc-storage001.stor.hpc.local:/ifs/rekencluster/umcgst10/.envsync/tmp01 /apps -t nfs -o '/(^|,)vers=4(,|$)/' -f /apps
+
+# All nodes should have their home filesystem mounted read/write
+# and not on the local system disk, but from a shared storage system instead.
+ * || check_fs_mount_rw -f /home -t '/(ext4|gpfs|lustre|nfs)/'
+
+# All nodes should have their apps filesystem mounted read only
+# and not on the local system disk, but from a shared storage system instead.
+# * || check_fs_mount_ro -f /apps -t '/(ext4|gpfs|lustre|nfs)/'
+
+# All nodes should have their tmp filesystems mounted read/write.
+# We do not check the mounts for each and every group, because the list of groups changes regularly.
+# Instead we check only the Analysis Team Development (atd) group:
+# * || check_fs_mount_rw -f /groups/umcg-atd/tmp01 -t '/(ext4|gpfs|lustre|nfs)/'
+
+
+#######################################################################
+###
+### File/metadata checks
+###
+# These should always be directories and always be read/write/execute and sticky.
+ * || check_file_test -r -w -x -d -k /tmp /var/tmp
+
+# These should always be readable and should never be empty.
+ * || check_file_test -r -s /etc/passwd /etc/group
+
+# Assert common properties for /dev/null (which occasionally gets clobbered).
+ * || check_file_test -c -r -w /dev/null /dev/zero
+ * || check_file_stat -m 0666 -u 0 -g 0 -t 1 -T 3 /dev/null
+
+# Make sure there's relatively recent activity from the syslog.
+ * || check_file_stat -n 7200 /var/log/messages
+
+# Validate a couple important accounts in the passwd file.
+ * || check_file_contents /etc/passwd "/^root:x:0:0:/" "sshd:*"
+
+
+#######################################################################
+###
+### Process checks
+###
+# Everybody needs sshd running, right? But don't use -r (restart)!
+ * || check_ps_service -u root -S sshd
+
+# The cron daemon is another useful critter...
+ * || check_ps_service -r crond
+
+# This is only valid for RHEL6 and similar/newer systems.
+ * || check_ps_service -d rsyslogd -r rsyslog
+
+# Double your core count is a good rule of thumb for load average max.
+# This should work if you place it after one of the check_hw_*() checks.
+ * || check_ps_loadavg $((2*HW_CORES))
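
For reference, a sketch of how the hardware checks in the user-interface template above would render with the vars.yml values from this change (ui_sockets: 4, ui_cores_per_socket: 1, ui_real_memory: 7800); the concrete numbers are derived here for illustration and are not part of the patch itself:

     * || check_hw_cpuinfo 4 4 4
     * || check_hw_physmem 7800MB 7800MB 5%

check_hw_cpuinfo takes sockets, cores and threads; because the template uses ui_sockets * ui_cores_per_socket for both the core and the thread count, it effectively assumes hyper-threading is disabled on the UI VM. The 5% fudge factor lets the detected RAM deviate slightly from the configured 7800 MB before NHC flags the node.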