Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Nibbler updates for new Lustre file system and vlan1068. #707

Merged
merged 13 commits into from
Jan 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion group_vars/nibbler_cluster/ip_addresses.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ ip_addresses:
nb_internal_storage:
address: 10.10.2.61
netmask: /32
vlan1068:
address: 172.23.60.3
netmask: /32
nb-repo:
nb_internal_management:
address: 10.10.1.56
Expand All @@ -44,6 +47,9 @@ ip_addresses:
nb_internal_storage:
address: 10.10.2.68
netmask: /32
vlan1068:
address: 172.23.60.7
netmask: /32
nb-transfer:
nb_internal_management:
address: 10.10.1.12
Expand All @@ -60,34 +66,49 @@ ip_addresses:
nb_internal_storage:
address: 10.10.2.190
netmask: /32
vlan1068:
address: 172.23.60.6
netmask: /32
nb-vcompute02:
nb_internal_management:
address: 10.10.1.246
netmask: /32
nb_internal_storage:
address: 10.10.2.152
netmask: /32
vlan1068:
address: 172.23.60.29
netmask: /32
nb-vcompute03:
nb_internal_management:
address: 10.10.1.32
netmask: /32
nb_internal_storage:
address: 10.10.2.220
netmask: /32
vlan1068:
address: 172.23.60.15
netmask: /32
nb-vcompute04:
nb_internal_management:
address: 10.10.1.197
netmask: /32
nb_internal_storage:
address: 10.10.2.129
netmask: /32
vlan1068:
address: 172.23.60.22
netmask: /32
nb-vcompute05:
nb_internal_management:
address: 10.10.1.229
netmask: /32
nb_internal_storage:
address: 10.10.2.39
netmask: /32
vlan1068:
address: 172.23.60.14
netmask: /32
nibbler:
nb_internal_management:
address: 10.10.1.112
Expand All @@ -96,7 +117,7 @@ ip_addresses:
address: 10.10.2.87
netmask: /32
vlan1068:
address: 172.23.60.10
address: 172.23.60.4
netmask: /32
tunnel:
nb_internal_management:
Expand Down
152 changes: 43 additions & 109 deletions group_vars/nibbler_cluster/vars.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ slurm_partitions:
extra_options: 'TRESBillingWeights="CPU=1.0,Mem=0.5G" DenyQos=ds-short,ds-medium,ds-long'
- name: gpu_a40 # Must be in sync with group listed in Ansible inventory.
default: no
nodes: "{{ stack_prefix }}-vcompute04" # Must be in sync with Ansible hostnames listed in inventory.
nodes: "{{ stack_prefix }}-vcompute[04-05]" # Must be in sync with Ansible hostnames listed in inventory.
max_nodes_per_job: "{% if slurm_allow_jobs_to_span_nodes is defined and slurm_allow_jobs_to_span_nodes is true %}{{ groups['gpu_a40']|list|length }}{% else %}1{% endif %}"
max_cores_per_node: "{{ groups['gpu_a40'] | map('extract', hostvars, 'slurm_max_cpus_per_node') | first }}"
max_mem_per_node: "{{ groups['gpu_a40'] | map('extract', hostvars, 'slurm_max_mem_per_node') | first }}"
Expand All @@ -38,11 +38,11 @@ motd: |
=========================================================
!!! WARNING: {{ slurm_cluster_name | capitalize }} is in beta testing
=========================================================
This cluster may be redeployed from scratch, which
may result in complete data loss of home dirs
This cluster may be redeployed from scratch,
which may result in complete data loss of home dirs
and tmp0* group folders: You have been warned!!!
This does not affect prm0* group folders,
which are on a different (production) storage system.
This does not affect other group folders (if present),
which are on different (production) storage systems.
=========================================================
additional_etc_hosts:
- group: docs_library
Expand Down Expand Up @@ -180,7 +180,9 @@ regular_groups:
- 'umcg-datateam'
- 'umcg-ejp-rd'
- 'umcg-endocrinology'
- 'umcg-fg'
- 'umcg-franke-scrna'
- 'umcg-fu'
- 'umcg-gaf'
- 'umcg-gap'
- 'umcg-gastrocol'
Expand All @@ -190,13 +192,16 @@ regular_groups:
- 'umcg-griac'
- 'umcg-gsad'
- 'umcg-hematology'
- 'umcg-immunogenetics'
- 'umcg-impact'
- 'umcg-lifelines'
- 'umcg-lld'
- 'umcg-llnext'
- 'umcg-mic'
- 'umcg-micompany'
- 'umcg-mmbimg'
- 'umcg-msb'
- 'umcg-nawijn'
- 'umcg-oncogenetics'
- 'umcg-pmb'
- 'umcg-pub'
Expand All @@ -208,6 +213,7 @@ regular_groups:
- 'umcg-ukb'
- 'umcg-ugli'
- 'umcg-verbeek'
- 'umcg-vdakker'
- 'umcg-weersma'
- 'umcg-wijmenga'
regular_users:
Expand Down Expand Up @@ -252,9 +258,15 @@ regular_users:
- user: 'umcg-endocrinology-dm'
groups: ['umcg-endocrinology']
sudoers: '%umcg-endocrinology-dms'
- user: 'umcg-fg-dm'
groups: ['umcg-fg']
sudoers: '%umcg-fg-dms'
- user: 'umcg-franke-scrna-dm'
groups: ['umcg-franke-scrna']
sudoers: '%umcg-franke-scrna-dms'
- user: 'umcg-fu-dm'
groups: ['umcg-fu']
sudoers: '%umcg-fu-dms'
- user: 'umcg-gaf-ateambot'
groups: ['umcg-gaf']
sudoers: 'umcg-gvdvries,umcg-kdelange,umcg-mbenjamins,umcg-mbijlsma,umcg-pneerincx,umcg-rkanninga'
Expand Down Expand Up @@ -288,6 +300,9 @@ regular_users:
- user: 'umcg-hematology-dm'
groups: ['umcg-hematology']
sudoers: '%umcg-hematology-dms'
- user: 'umcg-immunogenetics-dm'
groups: ['umcg-immunogenetics']
sudoers: '%umcg-immunogenetics-dms'
- user: 'umcg-impact-dm'
groups: ['umcg-impact']
sudoers: '%umcg-impact'
Expand All @@ -300,6 +315,9 @@ regular_users:
- user: 'umcg-llnext-dm'
groups: ['umcg-llnext']
sudoers: '%umcg-llnext-dms'
- user: 'umcg-mic-dm'
groups: ['umcg-mic']
sudoers: '%umcg-mic-dms'
- user: 'umcg-micompany-dm'
groups: ['umcg-micompany']
sudoers: '%umcg-micompany-dms'
Expand All @@ -309,6 +327,9 @@ regular_users:
- user: 'umcg-msb-dm'
groups: ['umcg-msb']
sudoers: '%umcg-msb'
- user: 'umcg-nawijn-dm'
groups: ['umcg-nawijn']
sudoers: '%umcg-nawijn-dms'
- user: 'umcg-oncogenetics-dm'
groups: ['umcg-oncogenetics']
sudoers: '%umcg-oncogenetics'
Expand Down Expand Up @@ -342,6 +363,9 @@ regular_users:
- user: 'umcg-verbeek-dm'
groups: ['umcg-verbeek']
sudoers: '%umcg-verbeek'
- user: 'umcg-vdakker-dm'
groups: ['umcg-vdakker']
sudoers: '%umcg-vdakker-dms'
- user: 'umcg-weersma-dm'
groups: ['umcg-weersma']
sudoers: '%umcg-weersma-dms'
Expand Down Expand Up @@ -375,125 +399,35 @@ volume_group_folders: [
# Shared storage related variables
#
lustre_client_networks:
- name: tcp??
interface: eth?
- name: tcp20
interface: eth2
pfs_mounts:
- pfs: umcgst01
#source: '172.23.57.201@tcp11:172.23.57.202@tcp11:/dh1'
- pfs: umcgst04
source: '172.23.60.161@tcp20:172.23.60.162@tcp20:/'
type: lustre
rw_options: 'defaults,_netdev,flock'
ro_options: 'defaults,_netdev,ro'
machines: "{{ groups['sys_admin_interface'] }}"
lfs_mounts:
- lfs: home
pfs: xyz
pfs: umcgst04
rw_machines: "{{ groups['cluster'] }}"
- lfs: tmp02
pfs: xyz
pfs: umcgst04
groups:
- name: umcg-aad
- name: umcg-as
- name: umcg-atd
- name: umcg-biogen
- name: umcg-bionic-mdd-gwama
- name: umcg-bios
- name: umcg-cineca
- name: umcg-dag3
- name: umcg-datateam
- name: umcg-ejp-rd
- name: umcg-endocrinology
- name: umcg-franke-scrna
- name: umcg-fg
mode: '2750'
- name: umcg-fu
- name: umcg-gaf
- name: umcg-gap
- name: umcg-gastrocol
- name: umcg-gcc
- name: umcg-gdio
- name: umcg-gonl
mode: '2750'
- name: umcg-griac
- name: umcg-gsad
- name: umcg-hematology
- name: umcg-immunogenetics
- name: umcg-impact
- name: umcg-lifelines
- name: umcg-lld
- name: umcg-llnext
- name: umcg-mic
- name: umcg-micompany
- name: umcg-mmbimg
- name: umcg-msb
- name: umcg-oncogenetics
- name: umcg-pmb
- name: umcg-pub
mode: '2750'
- name: umcg-radicon
- name: umcg-rehabilitation
- name: umcg-solve-rd
- name: umcg-sysops
- name: umcg-tifn
- name: umcg-ukb
- name: umcg-fg
mode: '2750'
- name: umcg-ugli
- name: umcg-verbeek
- name: umcg-vdakker
- name: umcg-weersma
- name: umcg-wijmenga
rw_machines: "{{ groups['user_interface'] + groups['deploy_admin_interface'] + groups['compute_vm'] }}"
- lfs: prm02
pfs: umcgst01
groups:
- name: umcg-aad
- name: umcg-as
- name: umcg-atd
- name: umcg-biogen
- name: umcg-bionic-mdd-gwama
- name: umcg-bios
- name: umcg-cineca
- name: umcg-dag3
- name: umcg-datateam
- name: umcg-ejp-rd
- name: umcg-endocrinology
- name: umcg-franke-scrna
- name: umcg-fg
- name: umcg-fu
- name: umcg-gaf
- name: umcg-gap
- name: umcg-gastrocol
- name: umcg-gcc
- name: umcg-gdio
- name: umcg-gonl
- name: umcg-griac
- name: umcg-gsad
- name: umcg-hematology
- name: umcg-immunogenetics
- name: umcg-impact
- name: umcg-lifelines
- name: umcg-lld
- name: umcg-llnext
- name: umcg-mic
- name: umcg-micompany
- name: umcg-mmbimg
- name: umcg-msb
- name: umcg-oncogenetics
- name: umcg-pmb
- name: umcg-pub
- name: umcg-radicon
- name: umcg-rehabilitation
- name: umcg-solve-rd
- name: umcg-sysops
- name: umcg-tifn
- name: umcg-ukb
- name: umcg-ugli
- name: umcg-verbeek
- name: umcg-vdakker
- name: umcg-weersma
- name: umcg-wijmenga
rw_machines: "{{ groups['user_interface'] }}"
# - lfs: prm02
# pfs: umcgst01
# groups:
# - name: umcg-atd
# - name: umcg-fg
# rw_machines: "{{ groups['user_interface'] }}"
- lfs: env02
pfs: xyz
pfs: umcgst04
ro_machines: "{{ groups['compute_vm'] + groups['user_interface'] }}"
rw_machines: "{{ groups['deploy_admin_interface'] }}"
...
2 changes: 1 addition & 1 deletion roles/sshd/templates/pam-weblogin.conf
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,4 @@ attribute = {{ pam_weblogin['user_name'] | default('uid') }}
# How long it takes (in seconds) before a user needs to prove their federated ID is still valid
# via login using authentication web server from their institute.
#
cache_duration = {{ pam_weblogin['cache_duration'] | default(3600) }}
cache_duration = {{ pam_weblogin['cache_duration'] | default(64800) }}
13 changes: 8 additions & 5 deletions static_inventories/nibbler_cluster.yml
Original file line number Diff line number Diff line change
Expand Up @@ -140,10 +140,11 @@ all:
slurm_cores_per_socket: 1
slurm_real_memory: 15884
slurm_local_disk: 0
slurm_features: 'prm02,tmp02'
slurm_features: 'tmp02'
slurm_ethernet_interfaces:
- eth0
- eth1
- eth2
compute_vm:
children:
regular: # Must be item from {{ slurm_partitions }} variable defined in group_vars/{{ stack_name }}/vars.yml
Expand All @@ -160,19 +161,20 @@ all:
- name: vlan1068
security_group: "{{ stack_prefix }}_storage"
local_volume_size_extra: 1
slurm_sockets: 8
slurm_sockets: 40
slurm_cores_per_socket: 1
slurm_real_memory: 15884
slurm_real_memory: 181180
slurm_max_cpus_per_node: "{{ slurm_sockets * slurm_cores_per_socket - 2 }}"
slurm_max_mem_per_node: "{{ slurm_real_memory - slurm_sockets * slurm_cores_per_socket * 512 }}"
slurm_local_disk: 975
slurm_features: 'tmp02'
slurm_ethernet_interfaces:
- eth0
- eth1
- eth2
gpu_a40: # Must be item from {{ slurm_partitions }} variable defined in group_vars/{{ stack_name }}/vars.yml
hosts:
nb-vcompute[04:05]
nb-vcompute[04:05]:
vars:
cloud_flavor: gpu.A40_8
gpu_count: 8
Expand All @@ -186,14 +188,15 @@ all:
local_volume_size_extra: 1
slurm_sockets: 32
slurm_cores_per_socket: 1
slurm_real_memory: 98304
slurm_real_memory: 110122
slurm_max_cpus_per_node: "{{ slurm_sockets * slurm_cores_per_socket - 2 }}"
slurm_max_mem_per_node: "{{ slurm_real_memory - slurm_sockets * slurm_cores_per_socket * 512 }}"
slurm_local_disk: 975
slurm_features: 'tmp02,gpu,A40'
slurm_ethernet_interfaces:
- eth0
- eth1
- eth2
administration:
children:
sys_admin_interface:
Expand Down