From e48a3fdb5d0a5b4720e0b3f3bf002cbef730f103 Mon Sep 17 00:00:00 2001 From: scimerman Date: Mon, 17 Oct 2022 13:08:09 +0200 Subject: [PATCH 01/30] Nibbler: added 8xGPU vars --- static_inventories/nibbler_cluster.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/static_inventories/nibbler_cluster.yml b/static_inventories/nibbler_cluster.yml index 8bb30c506..82430ec93 100644 --- a/static_inventories/nibbler_cluster.yml +++ b/static_inventories/nibbler_cluster.yml @@ -126,7 +126,7 @@ all: hosts: nb-vcompute04: vars: - cloud_flavor: gpu.A40 + cloud_flavor: gpu.A40_8 local_volume_size_extra: 1 slurm_sockets: 32 slurm_cores_per_socket: 1 From ab039445b55eddb1925b1ae674c205288e76ccd7 Mon Sep 17 00:00:00 2001 From: scimerman Date: Mon, 17 Oct 2022 13:09:32 +0200 Subject: [PATCH 02/30] Added GPU role --- roles/gpu/README.md | 6 ++++ roles/gpu/defaults/main.yml | 6 ++++ roles/gpu/files/blacklist-nouveau.conf | 2 ++ roles/gpu/files/nvidia-persistenced.service | 10 ++++++ roles/gpu/handlers/main.yml | 8 +++++ roles/gpu/tasks/configuration.yml | 24 +++++++++++++ roles/gpu/tasks/gpu.yml | 40 +++++++++++++++++++++ roles/gpu/tasks/main.yml | 19 ++++++++++ single_role_playbooks/gpu.yml | 5 +++ 9 files changed, 120 insertions(+) create mode 100644 roles/gpu/README.md create mode 100644 roles/gpu/defaults/main.yml create mode 100644 roles/gpu/files/blacklist-nouveau.conf create mode 100644 roles/gpu/files/nvidia-persistenced.service create mode 100644 roles/gpu/handlers/main.yml create mode 100644 roles/gpu/tasks/configuration.yml create mode 100644 roles/gpu/tasks/gpu.yml create mode 100644 roles/gpu/tasks/main.yml create mode 100644 single_role_playbooks/gpu.yml diff --git a/roles/gpu/README.md b/roles/gpu/README.md new file mode 100644 index 000000000..4391f22b5 --- /dev/null +++ b/roles/gpu/README.md @@ -0,0 +1,6 @@ +# NVidia GPU installation role for Centos 7 + +This role follows the latest instructions of the newest version of available +drivers, avaiable at [NVIDIA CUDA Installation Guide for +Linux](https://docs.nvidia.com/cuda/pdf/CUDA_Installation_Guide_Linux.pdf). + diff --git a/roles/gpu/defaults/main.yml b/roles/gpu/defaults/main.yml new file mode 100644 index 000000000..3a2f55b29 --- /dev/null +++ b/roles/gpu/defaults/main.yml @@ -0,0 +1,6 @@ +--- +gpu_cuda_version: '11.7.1' +gpu_driver_version: '515.65.01' +gpu_url_directory: 'https://developer.download.nvidia.com/compute/cuda/{{ gpu_cuda_version }}/local_installers/' +gpu_runfile: 'cuda_{{ gpu_cuda_version }}_{{ gpu_driver_version }}_linux.run' +... 
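Editor's aside, not part of the patch series: a rendered view of the two templated defaults above, to show what the role actually fetches. With gpu_cuda_version 11.7.1 and gpu_driver_version 515.65.01 the templates expand to:

    # Rendered values of the templated defaults (illustration only):
    gpu_url_directory: 'https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/'
    gpu_runfile: 'cuda_11.7.1_515.65.01_linux.run'
    # i.e. the role downloads:
    # https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda_11.7.1_515.65.01_linux.run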
diff --git a/roles/gpu/files/blacklist-nouveau.conf b/roles/gpu/files/blacklist-nouveau.conf new file mode 100644 index 000000000..c9b9bfcf7 --- /dev/null +++ b/roles/gpu/files/blacklist-nouveau.conf @@ -0,0 +1,2 @@ +blacklist nouveau +options nouveau modeset=0 diff --git a/roles/gpu/files/nvidia-persistenced.service b/roles/gpu/files/nvidia-persistenced.service new file mode 100644 index 000000000..85df2bdaa --- /dev/null +++ b/roles/gpu/files/nvidia-persistenced.service @@ -0,0 +1,10 @@ +[Unit] +Description=Initialize GPU at the startup of the system + +[Service] +ExecStart=/usr/bin/nvidia-persistenced --verbose +RestartSec=15 +Restart=always + +[Install] +WantedBy=multi-user.target diff --git a/roles/gpu/handlers/main.yml b/roles/gpu/handlers/main.yml new file mode 100644 index 000000000..de6bb1d34 --- /dev/null +++ b/roles/gpu/handlers/main.yml @@ -0,0 +1,8 @@ +--- +# - name: Restart apache +# ansible.builtin.service: +# name: httpd +# state: restarted +# become: true +# listen: "Reboot the machineestart web services" +... diff --git a/roles/gpu/tasks/configuration.yml b/roles/gpu/tasks/configuration.yml new file mode 100644 index 000000000..529ab53f8 --- /dev/null +++ b/roles/gpu/tasks/configuration.yml @@ -0,0 +1,24 @@ +--- +- name: Copy blacklist-nouveau.conf file into modprobe.d to disable Nouveau drivers + ansible.builtin.copy: + src: blacklist-nouveau.conf + dest: /etc/modprobe.d/blacklist-nouveau.conf + owner: root + group: root + mode: '0644' + become: true + +- name: Install NVidia persistence service + ansible.builtin.copy: + src: nvidia-persistenced.service + dest: /etc/systemd/system/nvidia-persistenced.service + validate: systemd-analyze verify %s + become: true + +- name: Enable a nvidia-persistence service + ansible.builtin.systemd: + name: nvidia-persistenced.service + state: started + enabled: yes + become: true +... diff --git a/roles/gpu/tasks/gpu.yml b/roles/gpu/tasks/gpu.yml new file mode 100644 index 000000000..654a47d32 --- /dev/null +++ b/roles/gpu/tasks/gpu.yml @@ -0,0 +1,40 @@ +--- +- name: Install yum requirements for gpu driver installation + ansible.builtin.yum: + state: 'installed' + update_cache: true + name: + - tar + - bzip2 + - make + - automake + - gcc + - gcc-c++ + - pciutils + - elfutils-libelf-devel + - libglvnd-devel + - bind-utils + - wget + become: true + + +- name: Download a driver installation file from NVidia + ansible.builtin.get_url: + url: '{{ gpu_url_directory }}/{{ gpu_runfile }}' + dest: '/root/{{ gpu_runfile }}' + mode: '0700' + # checksum: md5:66dffb5228a211e61d6d7ef4a86f5758 + become: true + +- name: Install driver from .run file + ansible.builtin.command: '/root/{{ gpu_runfile }} --silent --driver' + register: run_result + failed_when: run_result.rc != 0 + become: true + +- name: Remove installation file + ansible.builtin.file: + path: '/root/{{ gpu_runfile }}' + state: absent + become: true +... 
diff --git a/roles/gpu/tasks/main.yml b/roles/gpu/tasks/main.yml new file mode 100644 index 000000000..8b99f2501 --- /dev/null +++ b/roles/gpu/tasks/main.yml @@ -0,0 +1,19 @@ +--- +- name: Check if we have CUDA capable system + ansible.builtin.command: 'lspci' + register: lspci_nv + become: true + +- name: Check if we have already configured NVidia devices + ansible.builtin.command: 'nvidia-smi' + register: detect_devices + failed_when: false + become: true + +- name: Run GPU driver installation role + ansible.builtin.include: gpu.yml + when: ('"nvidia" in lspci_nv.stdout | lower') and (detect_devices.rc != 0) + +- name: Set configuration files for service and modprobe + ansible.builtin.include: configuration.yml +... diff --git a/single_role_playbooks/gpu.yml b/single_role_playbooks/gpu.yml new file mode 100644 index 000000000..0129d1966 --- /dev/null +++ b/single_role_playbooks/gpu.yml @@ -0,0 +1,5 @@ +--- +- hosts: cluster + roles: + - gpu +... From e2258fc7843a5c0df0e33e1d7271d9d707a5ee72 Mon Sep 17 00:00:00 2001 From: scimerman Date: Mon, 17 Oct 2022 13:12:59 +0200 Subject: [PATCH 03/30] GPU: added to single goups cluster_part1 --- single_group_playbooks/cluster_part1.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/single_group_playbooks/cluster_part1.yml b/single_group_playbooks/cluster_part1.yml index 74a8a8e83..5599ba366 100644 --- a/single_group_playbooks/cluster_part1.yml +++ b/single_group_playbooks/cluster_part1.yml @@ -18,6 +18,7 @@ - logrotate - remove - update + - gpu - logins - figlet_motd - node_exporter From 950d0a1f07348a90c5f1f5b3a4bae29a8a4d76b9 Mon Sep 17 00:00:00 2001 From: scimerman Date: Thu, 20 Oct 2022 11:28:00 +0200 Subject: [PATCH 04/30] GPU: update --- roles/cluster/defaults/main.yml | 2 ++ roles/gpu/handlers/main.yml | 11 ++++---- roles/gpu/tasks/configuration.yml | 5 ++-- roles/gpu/tasks/gpu.yml | 33 +++++++++++++++++++++--- roles/gpu/tasks/main.yml | 14 ++++++++++ single_group_playbooks/cluster_part1.yml | 2 +- 6 files changed, 54 insertions(+), 13 deletions(-) diff --git a/roles/cluster/defaults/main.yml b/roles/cluster/defaults/main.yml index 2c053fcc5..b0a1b7995 100644 --- a/roles/cluster/defaults/main.yml +++ b/roles/cluster/defaults/main.yml @@ -20,6 +20,7 @@ cluster_common_packages: - ncurses-static - net-tools - openssl + - pciutils - qt5-qtbase - qt5-qtxmlpatterns - readline-static @@ -36,4 +37,5 @@ cluster_common_packages: - urw-base35-fonts - vim - wget + - yum-utils ... diff --git a/roles/gpu/handlers/main.yml b/roles/gpu/handlers/main.yml index de6bb1d34..cf1a26662 100644 --- a/roles/gpu/handlers/main.yml +++ b/roles/gpu/handlers/main.yml @@ -1,8 +1,7 @@ --- -# - name: Restart apache -# ansible.builtin.service: -# name: httpd -# state: restarted -# become: true -# listen: "Reboot the machineestart web services" +- name: Restart server because of the pending updates + ansible.builtin.reboot: + msg: "Reboot initiated by Ansible" + listen: "reboot_server" + become: true ... 
diff --git a/roles/gpu/tasks/configuration.yml b/roles/gpu/tasks/configuration.yml index 529ab53f8..eb5e758a5 100644 --- a/roles/gpu/tasks/configuration.yml +++ b/roles/gpu/tasks/configuration.yml @@ -12,13 +12,12 @@ ansible.builtin.copy: src: nvidia-persistenced.service dest: /etc/systemd/system/nvidia-persistenced.service - validate: systemd-analyze verify %s become: true -- name: Enable a nvidia-persistence service +- name: Enable a nvidia-persistenced service ansible.builtin.systemd: name: nvidia-persistenced.service state: started - enabled: yes + enabled: true become: true ... diff --git a/roles/gpu/tasks/gpu.yml b/roles/gpu/tasks/gpu.yml index 654a47d32..c08224742 100644 --- a/roles/gpu/tasks/gpu.yml +++ b/roles/gpu/tasks/gpu.yml @@ -17,19 +17,46 @@ - wget become: true +# ansible_kernel variable is not working, as after reboot, still holds old kernel +- name: Get current kernel version + ansible.builtin.command: '/usr/bin/uname -r' + register: uname_output + failed_when: uname_output.rc != 0 + when: true + become: true + +- name: Set kernel version fact + ansible.builtin.set_fact: + kernel_version: "{{ uname_output.stdout }}" + +- name: Install kernel developement package matching running kernel version + ansible.builtin.yum: + name: 'kernel-devel-{{ kernel_version }}' + register: yum_result + failed_when: yum_result.rc != 0 + when: true + become: true - name: Download a driver installation file from NVidia ansible.builtin.get_url: url: '{{ gpu_url_directory }}/{{ gpu_runfile }}' dest: '/root/{{ gpu_runfile }}' mode: '0700' - # checksum: md5:66dffb5228a211e61d6d7ef4a86f5758 + become: true + +- name: "Check if driver downloaded" + ansible.builtin.stat: + path: '/root/{{ gpu_runfile }}' + when: true + register: driver_downloaded become: true - name: Install driver from .run file ansible.builtin.command: '/root/{{ gpu_runfile }} --silent --driver' - register: run_result - failed_when: run_result.rc != 0 + register: install_result + failed_when: install_result.rc != 0 + when: driver_downloaded.stat.exists + notify: reboot_server become: true - name: Remove installation file diff --git a/roles/gpu/tasks/main.yml b/roles/gpu/tasks/main.yml index 8b99f2501..b99904949 100644 --- a/roles/gpu/tasks/main.yml +++ b/roles/gpu/tasks/main.yml @@ -1,13 +1,27 @@ --- + +- name: Check if system needs to be restarted + ansible.builtin.command: '/bin/needs-restarting -r' + register: needs_restarting + failed_when: 'needs_restarting.rc > 1' + changed_when: 'needs_restarting.rc == 1' + become: true + notify: reboot_server + +- name: Flush handlers + ansible.builtin.meta: flush_handlers + - name: Check if we have CUDA capable system ansible.builtin.command: 'lspci' register: lspci_nv + when: true become: true - name: Check if we have already configured NVidia devices ansible.builtin.command: 'nvidia-smi' register: detect_devices failed_when: false + when: true become: true - name: Run GPU driver installation role diff --git a/single_group_playbooks/cluster_part1.yml b/single_group_playbooks/cluster_part1.yml index 5599ba366..5cc4d9de9 100644 --- a/single_group_playbooks/cluster_part1.yml +++ b/single_group_playbooks/cluster_part1.yml @@ -18,11 +18,11 @@ - logrotate - remove - update - - gpu - logins - figlet_motd - node_exporter - cluster + - gpu # depends on cluster role - resolver - coredumps ... 
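Editor's aside, not part of the patch series: the tasks added in this patch rely on the notify/listen/flush_handlers pattern (the 'reboot_server' handler defined above). A minimal, self-contained sketch of that pattern, with illustrative task names:

    ---
    # A task that reports "changed" queues the 'reboot_server' handler;
    # the meta task then forces the queued reboot to run immediately
    # instead of at the end of the play.
    - name: Example task that may leave a reboot pending
      ansible.builtin.yum:
        name: kernel
        state: latest
      become: true
      notify: reboot_server

    - name: Reboot now if any task above notified the handler
      ansible.builtin.meta: flush_handlers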
From 9d8c7af2d70f31b8853c58a1058669ef5ae0c02e Mon Sep 17 00:00:00 2001 From: scimerman Date: Fri, 4 Nov 2022 17:47:33 +0100 Subject: [PATCH 05/30] GPU update --- roles/gpu/README.md | 15 +++++++++++++++ roles/gpu/tasks/main.yml | 1 - single_group_playbooks/cluster_part1.yml | 2 +- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/roles/gpu/README.md b/roles/gpu/README.md index 4391f22b5..147d3e709 100644 --- a/roles/gpu/README.md +++ b/roles/gpu/README.md @@ -4,3 +4,18 @@ This role follows the latest instructions of the newest version of available drivers, avaiable at [NVIDIA CUDA Installation Guide for Linux](https://docs.nvidia.com/cuda/pdf/CUDA_Installation_Guide_Linux.pdf). + + +## Role outline + +- installs `pciutils` tools +- checks if there is pci device from nvidia and if there is, then it +- installs all the needed packages to build the driver +- downloads the .run driver from nvidia (driver version is defined in defualts) +- installs and compile the driver module +- installs systemd service file, that automatically loads the driver upons system + boot, and that reloads the driver when/if it has failed operating + +TO-DO: +- extensive testing and benchmarking +- role for development software installation diff --git a/roles/gpu/tasks/main.yml b/roles/gpu/tasks/main.yml index b99904949..7407e6cc5 100644 --- a/roles/gpu/tasks/main.yml +++ b/roles/gpu/tasks/main.yml @@ -1,5 +1,4 @@ --- - - name: Check if system needs to be restarted ansible.builtin.command: '/bin/needs-restarting -r' register: needs_restarting diff --git a/single_group_playbooks/cluster_part1.yml b/single_group_playbooks/cluster_part1.yml index 5cc4d9de9..e3941660f 100644 --- a/single_group_playbooks/cluster_part1.yml +++ b/single_group_playbooks/cluster_part1.yml @@ -22,7 +22,7 @@ - figlet_motd - node_exporter - cluster - - gpu # depends on cluster role + - gpu # needs to run after role 'cluster' - resolver - coredumps ... From f2ff955da1995334680877369a0576d477387b6b Mon Sep 17 00:00:00 2001 From: scimerman Date: Fri, 4 Nov 2022 17:50:00 +0100 Subject: [PATCH 06/30] GPU: update --- roles/gpu/README.md | 7 +++---- roles/gpu/handlers/main.yml | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/roles/gpu/README.md b/roles/gpu/README.md index 147d3e709..42da98e07 100644 --- a/roles/gpu/README.md +++ b/roles/gpu/README.md @@ -4,18 +4,17 @@ This role follows the latest instructions of the newest version of available drivers, avaiable at [NVIDIA CUDA Installation Guide for Linux](https://docs.nvidia.com/cuda/pdf/CUDA_Installation_Guide_Linux.pdf). 
- - ## Role outline - installs `pciutils` tools - checks if there is pci device from nvidia and if there is, then it -- installs all the needed packages to build the driver +- installs on system needed yum packages that can later build the driver - downloads the .run driver from nvidia (driver version is defined in defualts) - installs and compile the driver module +- blacklists nouveau - installs systemd service file, that automatically loads the driver upons system boot, and that reloads the driver when/if it has failed operating -TO-DO: +## TO-DO - extensive testing and benchmarking - role for development software installation diff --git a/roles/gpu/handlers/main.yml b/roles/gpu/handlers/main.yml index cf1a26662..356b88410 100644 --- a/roles/gpu/handlers/main.yml +++ b/roles/gpu/handlers/main.yml @@ -1,7 +1,7 @@ --- - name: Restart server because of the pending updates ansible.builtin.reboot: - msg: "Reboot initiated by Ansible" + msg: "Reboot initiated by Ansible, because of the pending updates" listen: "reboot_server" become: true ... From bc06f59763be7888116e6a2200458eaec822a484 Mon Sep 17 00:00:00 2001 From: scimerman Date: Fri, 4 Nov 2022 17:56:20 +0100 Subject: [PATCH 07/30] GPU: limit hosts --- single_role_playbooks/gpu.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/single_role_playbooks/gpu.yml b/single_role_playbooks/gpu.yml index 0129d1966..d8651a6ed 100644 --- a/single_role_playbooks/gpu.yml +++ b/single_role_playbooks/gpu.yml @@ -1,5 +1,6 @@ --- -- hosts: cluster +- hosts: + - compute_vm roles: - gpu ... From 7b929585bde21d5ce6ad09afdaf57ec213176d64 Mon Sep 17 00:00:00 2001 From: scimerman Date: Mon, 14 Nov 2022 13:16:57 +0100 Subject: [PATCH 08/30] GPU: fixes --- roles/gpu/tasks/gpu.yml | 43 +++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/roles/gpu/tasks/gpu.yml b/roles/gpu/tasks/gpu.yml index c08224742..97e32f183 100644 --- a/roles/gpu/tasks/gpu.yml +++ b/roles/gpu/tasks/gpu.yml @@ -17,21 +17,25 @@ - wget become: true -# ansible_kernel variable is not working, as after reboot, still holds old kernel -- name: Get current kernel version - ansible.builtin.command: '/usr/bin/uname -r' - register: uname_output - failed_when: uname_output.rc != 0 - when: true - become: true +## ansible_kernel variable is not working, as after reboot, still holds old kernel +#- name: Get current kernel version +# ansible.builtin.command: '/usr/bin/uname -r' +# register: uname_output +# failed_when: uname_output.rc != 0 +# when: true +# become: true +# +#- name: Set kernel version fact +# ansible.builtin.set_fact: +# kernel_version: "{{ uname_output.stdout }}" -- name: Set kernel version fact - ansible.builtin.set_fact: - kernel_version: "{{ uname_output.stdout }}" +- name: Gather facts to get the latest kernel version + ansible.builtin.setup: + become: true - name: Install kernel developement package matching running kernel version ansible.builtin.yum: - name: 'kernel-devel-{{ kernel_version }}' + name: 'kernel-devel-{{ ansible_kernel }}' register: yum_result failed_when: yum_result.rc != 0 when: true @@ -44,21 +48,22 @@ mode: '0700' become: true -- name: "Check if driver downloaded" - ansible.builtin.stat: - path: '/root/{{ gpu_runfile }}' - when: true - register: driver_downloaded - become: true - - name: Install driver from .run file ansible.builtin.command: '/root/{{ gpu_runfile }} --silent --driver' register: install_result failed_when: install_result.rc != 0 - when: driver_downloaded.stat.exists + when: true 
notify: reboot_server become: true +- name: Enforce reboot, so that we can check if drivers are correctly isntalled + ansible.builtin.meta: flush_handlers + +- name: Check that drivers are correctly installed + ansible.builtin.command: 'nvidia-smi -L' + register: smi_output + failed_when: (smi_output.rc > 1) or ({{ smi_output | default([]) | length < 1 }}) + - name: Remove installation file ansible.builtin.file: path: '/root/{{ gpu_runfile }}' From f388a18db93ede65f58ee31d4beb3599f1f8f210 Mon Sep 17 00:00:00 2001 From: scimerman Date: Fri, 18 Nov 2022 17:45:07 +0100 Subject: [PATCH 09/30] GPU role update --- roles/gpu/defaults/main.yml | 7 ++- roles/gpu/tasks/gpu.yml | 79 +++++++++++++++++++------- roles/gpu/tasks/main.yml | 25 ++++---- static_inventories/nibbler_cluster.yml | 3 +- 4 files changed, 77 insertions(+), 37 deletions(-) diff --git a/roles/gpu/defaults/main.yml b/roles/gpu/defaults/main.yml index 3a2f55b29..ff0bd0357 100644 --- a/roles/gpu/defaults/main.yml +++ b/roles/gpu/defaults/main.yml @@ -1,6 +1,11 @@ --- gpu_cuda_version: '11.7.1' gpu_driver_version: '515.65.01' -gpu_url_directory: 'https://developer.download.nvidia.com/compute/cuda/{{ gpu_cuda_version }}/local_installers/' +gpu_url_dir: 'https://developer.download.nvidia.com/compute/cuda/{{ gpu_cuda_version }}/local_installers/' gpu_runfile: 'cuda_{{ gpu_cuda_version }}_{{ gpu_driver_version }}_linux.run' + +# gpu_dl_site: 'https://us.download.nvidia.com/tesla' +# gpu_driver_version: '450.80.02' +# gpu_runfile: 'NVIDIA-Linux-x86_64-{{ gpu_driver_version }}.run' +# gpu_url: '{{ gpu_dl_site }}/{{ gpu_driver_version }}/{{ gpu_runfile }}' ... diff --git a/roles/gpu/tasks/gpu.yml b/roles/gpu/tasks/gpu.yml index 97e32f183..5aab78601 100644 --- a/roles/gpu/tasks/gpu.yml +++ b/roles/gpu/tasks/gpu.yml @@ -17,17 +17,16 @@ - wget become: true -## ansible_kernel variable is not working, as after reboot, still holds old kernel -#- name: Get current kernel version -# ansible.builtin.command: '/usr/bin/uname -r' -# register: uname_output -# failed_when: uname_output.rc != 0 -# when: true -# become: true -# -#- name: Set kernel version fact -# ansible.builtin.set_fact: -# kernel_version: "{{ uname_output.stdout }}" +# # ansible_kernel variable is not working, as after reboot, still holds old kernel +# - name: Get current kernel version +# ansible.builtin.command: '/usr/bin/uname -r' +# register: uname_output +# failed_when: uname_output.rc != 0 +# when: true +# become: true +# - name: Set kernel version fact +# ansible.builtin.set_fact: +# kernel_version: "{{ uname_output.stdout }}" - name: Gather facts to get the latest kernel version ansible.builtin.setup: @@ -43,7 +42,7 @@ - name: Download a driver installation file from NVidia ansible.builtin.get_url: - url: '{{ gpu_url_directory }}/{{ gpu_runfile }}' + url: '{{ gpu_url_dir }}/{{ gpu_runfile }}' dest: '/root/{{ gpu_runfile }}' mode: '0700' become: true @@ -53,20 +52,56 @@ register: install_result failed_when: install_result.rc != 0 when: true - notify: reboot_server become: true -- name: Enforce reboot, so that we can check if drivers are correctly isntalled +# - name: Remove installation file +# ansible.builtin.file: +# path: '/root/{{ gpu_runfile }}' +# state: absent +# become: true + +- name: Copy blacklist-nouveau.conf file into modprobe.d to disable Nouveau drivers + ansible.builtin.copy: + src: blacklist-nouveau.conf + dest: /etc/modprobe.d/blacklist-nouveau.conf + owner: root + group: root + mode: '0644' + become: true + notify: 'reboot_server' + +# - name: Install NVidia 
persistence service +# ansible.builtin.copy: +# src: nvidia-persistenced.service +# dest: /etc/systemd/system/nvidia-persistenced.service +# become: true +# notify: 'reboot_server' + +# - name: Enable a nvidia-persistenced service +# ansible.builtin.systemd: +# name: nvidia-persistenced.service +# state: started +# enabled: true +# become: true +# notify: 'reboot_server' + +- name: Enforce reboot, so that we can check if drivers are correctly installed ansible.builtin.meta: flush_handlers -- name: Check that drivers are correctly installed - ansible.builtin.command: 'nvidia-smi -L' - register: smi_output - failed_when: (smi_output.rc > 1) or ({{ smi_output | default([]) | length < 1 }}) +- name: Count the number of available nvidia devices + # ansible.builtin.command: '/usr/bin/ls /dev/nvidia[0-9]{,[0-9]' + ansible.builtin.command: 'lspci' + register: lspci + when: true + changed_when: false + become: true -- name: Remove installation file - ansible.builtin.file: - path: '/root/{{ gpu_runfile }}' - state: absent +- name: Check that nvidia-smi sees all devices + ansible.builtin.command: 'nvidia-smi -L' + register: smi + when: true + changed_when: false + failed_when: ( smi.rc > 0) or + ( lspci.stdout|lower|regex_findall('nvidia')|length != smi.stdout|lower|regex_findall('nvidia')|length ) become: true ... diff --git a/roles/gpu/tasks/main.yml b/roles/gpu/tasks/main.yml index 7407e6cc5..13bad47f4 100644 --- a/roles/gpu/tasks/main.yml +++ b/roles/gpu/tasks/main.yml @@ -10,23 +10,22 @@ - name: Flush handlers ansible.builtin.meta: flush_handlers -- name: Check if we have CUDA capable system - ansible.builtin.command: 'lspci' - register: lspci_nv - when: true - become: true +# - name: Check if we have CUDA capable system +# ansible.builtin.command: 'lspci' +# register: lspci +# changed_when: false +# when: true +# become: true - name: Check if we have already configured NVidia devices - ansible.builtin.command: 'nvidia-smi' - register: detect_devices + ansible.builtin.command: 'nvidia-smi -L' + register: smi + changed_when: false failed_when: false - when: true become: true -- name: Run GPU driver installation role +- name: Install GPU driver if not all GPU devices are present and working ansible.builtin.include: gpu.yml - when: ('"nvidia" in lspci_nv.stdout | lower') and (detect_devices.rc != 0) - -- name: Set configuration files for service and modprobe - ansible.builtin.include: configuration.yml + when: ( gpu_count is defined ) and + ( smi.stdout|default([])|lower|regex_findall('nvidia')|length != gpu_count ) ... 
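Editor's aside, not part of the patch series: a small sketch of how the regex_findall-based count in the condition above behaves. The 'example_smi_stdout' value is a stand-in, not real nvidia-smi output:

    # Stand-in demonstration of the counting logic used in tasks/main.yml:
    - name: Show how the GPU count comparison works
      vars:
        example_smi_stdout: "GPU 0: NVIDIA A40\nGPU 1: NVIDIA A40"  # stand-in for smi.stdout
        gpu_count: 8
      ansible.builtin.debug:
        msg: "found {{ example_smi_stdout | lower | regex_findall('nvidia') | length }} of {{ gpu_count }} expected devices"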
diff --git a/static_inventories/nibbler_cluster.yml b/static_inventories/nibbler_cluster.yml index 14c5c0cae..b9f4896e0 100644 --- a/static_inventories/nibbler_cluster.yml +++ b/static_inventories/nibbler_cluster.yml @@ -91,7 +91,7 @@ all: hosts: nb-dai: cloud_flavor: m1.small - local_volume_size_extra: 200 + local_volume_size_extra: 3000 user_interface: hosts: nibbler: @@ -128,6 +128,7 @@ all: nb-vcompute04: vars: cloud_flavor: gpu.A40_8 + gpu_count: 8 local_volume_size_extra: 1 slurm_sockets: 32 slurm_cores_per_socket: 1 From 33ad9c576359787496b824c344945f5e9fffbeae Mon Sep 17 00:00:00 2001 From: scimerman Date: Fri, 18 Nov 2022 17:48:27 +0100 Subject: [PATCH 10/30] GPU role: removed unneeded files and functions --- roles/gpu/tasks/configuration.yml | 23 ----------------- roles/gpu/tasks/gpu.yml | 42 ++++++------------------------- roles/gpu/tasks/main.yml | 7 ------ 3 files changed, 8 insertions(+), 64 deletions(-) delete mode 100644 roles/gpu/tasks/configuration.yml diff --git a/roles/gpu/tasks/configuration.yml b/roles/gpu/tasks/configuration.yml deleted file mode 100644 index eb5e758a5..000000000 --- a/roles/gpu/tasks/configuration.yml +++ /dev/null @@ -1,23 +0,0 @@ ---- -- name: Copy blacklist-nouveau.conf file into modprobe.d to disable Nouveau drivers - ansible.builtin.copy: - src: blacklist-nouveau.conf - dest: /etc/modprobe.d/blacklist-nouveau.conf - owner: root - group: root - mode: '0644' - become: true - -- name: Install NVidia persistence service - ansible.builtin.copy: - src: nvidia-persistenced.service - dest: /etc/systemd/system/nvidia-persistenced.service - become: true - -- name: Enable a nvidia-persistenced service - ansible.builtin.systemd: - name: nvidia-persistenced.service - state: started - enabled: true - become: true -... 
diff --git a/roles/gpu/tasks/gpu.yml b/roles/gpu/tasks/gpu.yml index 5aab78601..d167edfb0 100644 --- a/roles/gpu/tasks/gpu.yml +++ b/roles/gpu/tasks/gpu.yml @@ -17,17 +17,6 @@ - wget become: true -# # ansible_kernel variable is not working, as after reboot, still holds old kernel -# - name: Get current kernel version -# ansible.builtin.command: '/usr/bin/uname -r' -# register: uname_output -# failed_when: uname_output.rc != 0 -# when: true -# become: true -# - name: Set kernel version fact -# ansible.builtin.set_fact: -# kernel_version: "{{ uname_output.stdout }}" - - name: Gather facts to get the latest kernel version ansible.builtin.setup: become: true @@ -54,11 +43,11 @@ when: true become: true -# - name: Remove installation file -# ansible.builtin.file: -# path: '/root/{{ gpu_runfile }}' -# state: absent -# become: true +- name: Remove installation file + ansible.builtin.file: + path: '/root/{{ gpu_runfile }}' + state: absent + become: true - name: Copy blacklist-nouveau.conf file into modprobe.d to disable Nouveau drivers ansible.builtin.copy: @@ -70,21 +59,6 @@ become: true notify: 'reboot_server' -# - name: Install NVidia persistence service -# ansible.builtin.copy: -# src: nvidia-persistenced.service -# dest: /etc/systemd/system/nvidia-persistenced.service -# become: true -# notify: 'reboot_server' - -# - name: Enable a nvidia-persistenced service -# ansible.builtin.systemd: -# name: nvidia-persistenced.service -# state: started -# enabled: true -# become: true -# notify: 'reboot_server' - - name: Enforce reboot, so that we can check if drivers are correctly installed ansible.builtin.meta: flush_handlers @@ -96,12 +70,12 @@ changed_when: false become: true -- name: Check that nvidia-smi sees all devices +- name: Final check to confirm all devices are working ansible.builtin.command: 'nvidia-smi -L' register: smi when: true changed_when: false - failed_when: ( smi.rc > 0) or - ( lspci.stdout|lower|regex_findall('nvidia')|length != smi.stdout|lower|regex_findall('nvidia')|length ) + failed_when: ( smi.rc != 0) or + ( smi.stdout|default([])|lower|regex_findall('nvidia')|length != gpu_count ) become: true ... diff --git a/roles/gpu/tasks/main.yml b/roles/gpu/tasks/main.yml index 13bad47f4..07e932dd4 100644 --- a/roles/gpu/tasks/main.yml +++ b/roles/gpu/tasks/main.yml @@ -10,13 +10,6 @@ - name: Flush handlers ansible.builtin.meta: flush_handlers -# - name: Check if we have CUDA capable system -# ansible.builtin.command: 'lspci' -# register: lspci -# changed_when: false -# when: true -# become: true - - name: Check if we have already configured NVidia devices ansible.builtin.command: 'nvidia-smi -L' register: smi From 487f7532efb2cbf9eaaf2e1b7f9b6b0f4b4c842c Mon Sep 17 00:00:00 2001 From: Pieter Neerincx Date: Mon, 7 Nov 2022 16:12:03 +0100 Subject: [PATCH 11/30] Updated README.md for Python dependency issue on macOS. Disabled use_ssh_args for synchronize tasks due to incompatibility issue with latest mitogen release. --- README.md | 5 +++++ roles/cluster/tasks/main.yml | 2 +- roles/figlet_motd/tasks/main.yml | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0e97622cd..e9cb07d39 100644 --- a/README.md +++ b/README.md @@ -129,6 +129,11 @@ pip3 install 'openstacksdk<0.99' pip3 install ruamel.yaml pip3 install netaddr # +# On macOS only to prevent this error: +# crypt.crypt not supported on Mac OS X/Darwin, install passlib python module. +# +pip3 install passlib +# # Optional: install Ansible with pip. 
# You may skip this step if you already installed Ansible by other means. # E.g. with HomeBrew on macOS, with yum or dnf on Linux, etc. diff --git a/roles/cluster/tasks/main.yml b/roles/cluster/tasks/main.yml index 43e2bbe52..ada09d976 100644 --- a/roles/cluster/tasks/main.yml +++ b/roles/cluster/tasks/main.yml @@ -20,7 +20,7 @@ dest: '/etc/skel/' owner: 'no' group: 'no' - use_ssh_args: true + #use_ssh_args: true # Temporarily disabled as it is broken in Mitogen 0.3.3. Fix is already merged and will be in next Mitogen version. ssh_connection_multiplexing: true rsync_opts: # --omit-dir-times Is required to prevent "sync error: some files/attrs were not transferred" diff --git a/roles/figlet_motd/tasks/main.yml b/roles/figlet_motd/tasks/main.yml index 3af821f25..7f83093b9 100644 --- a/roles/figlet_motd/tasks/main.yml +++ b/roles/figlet_motd/tasks/main.yml @@ -12,7 +12,7 @@ dest: "/usr/share/figlet/{{ item }}" owner: false group: false - use_ssh_args: true + #use_ssh_args: true # Temporarily disabled as it is broken in Mitogen 0.3.3. Fix is already merged and will be in next Mitogen version. ssh_connection_multiplexing: true rsync_opts: - '--chmod=Fu=rw,Fgo=r' From b58b38ccd5e1c306ad4fd9e0edd9ba801103e42c Mon Sep 17 00:00:00 2001 From: Pieter Neerincx Date: Mon, 7 Nov 2022 16:13:32 +0100 Subject: [PATCH 12/30] Patch original docker.service file for systemd as opposed to the symlink, which is only present when the service is enabled. --- roles/docker/tasks/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/roles/docker/tasks/main.yml b/roles/docker/tasks/main.yml index 3cfb8a636..15f97d78b 100644 --- a/roles/docker/tasks/main.yml +++ b/roles/docker/tasks/main.yml @@ -30,7 +30,7 @@ - name: Update docker service to reload every time after iptables service ansible.builtin.blockinfile: - path: /etc/systemd/system/multi-user.target.wants/docker.service + path: /usr/lib/systemd/system/docker.service insertafter: '\[Unit\]' block: | After=iptables.service From 792411043c117ac881b6473634571dc4c8f32640 Mon Sep 17 00:00:00 2001 From: Pieter Neerincx Date: Mon, 7 Nov 2022 16:23:17 +0100 Subject: [PATCH 13/30] Fixed complaint from yaml linter. --- roles/cluster/tasks/main.yml | 3 ++- roles/figlet_motd/tasks/main.yml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/roles/cluster/tasks/main.yml b/roles/cluster/tasks/main.yml index ada09d976..7b970d845 100644 --- a/roles/cluster/tasks/main.yml +++ b/roles/cluster/tasks/main.yml @@ -20,7 +20,8 @@ dest: '/etc/skel/' owner: 'no' group: 'no' - #use_ssh_args: true # Temporarily disabled as it is broken in Mitogen 0.3.3. Fix is already merged and will be in next Mitogen version. + # Temporarily disabled as it is broken in Mitogen 0.3.3. Fix is already merged and will be in next Mitogen version. + # use_ssh_args: true ssh_connection_multiplexing: true rsync_opts: # --omit-dir-times Is required to prevent "sync error: some files/attrs were not transferred" diff --git a/roles/figlet_motd/tasks/main.yml b/roles/figlet_motd/tasks/main.yml index 7f83093b9..4f043971f 100644 --- a/roles/figlet_motd/tasks/main.yml +++ b/roles/figlet_motd/tasks/main.yml @@ -12,7 +12,8 @@ dest: "/usr/share/figlet/{{ item }}" owner: false group: false - #use_ssh_args: true # Temporarily disabled as it is broken in Mitogen 0.3.3. Fix is already merged and will be in next Mitogen version. + # Temporarily disabled as it is broken in Mitogen 0.3.3. Fix is already merged and will be in next Mitogen version. 
+ # use_ssh_args: true ssh_connection_multiplexing: true rsync_opts: - '--chmod=Fu=rw,Fgo=r' From 621389f10f747b4fc91260988f74b47fc9fdab81 Mon Sep 17 00:00:00 2001 From: scimerman Date: Fri, 18 Nov 2022 17:50:27 +0100 Subject: [PATCH 14/30] GPU role: removed uneeded commands --- roles/gpu/tasks/gpu.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/roles/gpu/tasks/gpu.yml b/roles/gpu/tasks/gpu.yml index d167edfb0..bcceb6dab 100644 --- a/roles/gpu/tasks/gpu.yml +++ b/roles/gpu/tasks/gpu.yml @@ -62,14 +62,6 @@ - name: Enforce reboot, so that we can check if drivers are correctly installed ansible.builtin.meta: flush_handlers -- name: Count the number of available nvidia devices - # ansible.builtin.command: '/usr/bin/ls /dev/nvidia[0-9]{,[0-9]' - ansible.builtin.command: 'lspci' - register: lspci - when: true - changed_when: false - become: true - - name: Final check to confirm all devices are working ansible.builtin.command: 'nvidia-smi -L' register: smi From b2480e74076c9a62adc90cb6317901681a6e92c7 Mon Sep 17 00:00:00 2001 From: scimerman Date: Fri, 18 Nov 2022 18:09:15 +0100 Subject: [PATCH 15/30] GPU update --- roles/gpu/README.md | 38 ++++++++++++++++++++++++++++---------- roles/gpu/tasks/main.yml | 2 +- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/roles/gpu/README.md b/roles/gpu/README.md index 42da98e07..16ceaea9b 100644 --- a/roles/gpu/README.md +++ b/roles/gpu/README.md @@ -4,17 +4,35 @@ This role follows the latest instructions of the newest version of available drivers, avaiable at [NVIDIA CUDA Installation Guide for Linux](https://docs.nvidia.com/cuda/pdf/CUDA_Installation_Guide_Linux.pdf). +The driver can be installed via yum repository, but the version limiting and +driver version control is quite hard to implement. Therefore the driver is +installed by downloading and running the cuda .run file. +Driver is installed and compiled as Dynamic Kernel Module Support and will +rebuild with every new kernel instalation. 
+ + ## Role outline -- installs `pciutils` tools -- checks if there is pci device from nvidia and if there is, then it -- installs on system needed yum packages that can later build the driver -- downloads the .run driver from nvidia (driver version is defined in defualts) -- installs and compile the driver module +- it expects the gpu_count to be defined per invididual machine +- attempts to gather the GPU device status by running `nvidia-smi` command +- install the GPU driver if + - `nvidia-smi` command is not available (cuda driver was not installed) + - `nvidia-smi` reports different number of GPU devices than expected from `gpu_count` +- yum install on machine packages that is needed for driver install and compile +- downloads the cuda .run driver file from nvidia website (version defined in defualts) +- installs and compile the Dynamic Kernel Module Support driver - blacklists nouveau -- installs systemd service file, that automatically loads the driver upons system - boot, and that reloads the driver when/if it has failed operating +- reboots the machine +- checks if number of GPU devices reported from `nvidia-smi` is same as in `gpu_count` + +## Other comments + + - The smaller Nvidia .run driver installation file is also avaialble, but then + number of commands and options are missing on system (for example `nvidia-smi`) + - The long term availablitiy of .run file on nvidia website is not of concern as + the cuda archive website is in 2022 still containing the old versions from 2007 + - driver installation vial yum repository is difficult from two reasons: + - first the version needs to be limitied for nvidia-driver rpm and 15 other packages + - not all old versions are available on repository, only recent ones -## TO-DO -- extensive testing and benchmarking -- role for development software installation +[cuda archive website](https://developer.nvidia.com/cuda-toolkit-archive) diff --git a/roles/gpu/tasks/main.yml b/roles/gpu/tasks/main.yml index 07e932dd4..d602ae9ee 100644 --- a/roles/gpu/tasks/main.yml +++ b/roles/gpu/tasks/main.yml @@ -7,7 +7,7 @@ become: true notify: reboot_server -- name: Flush handlers +- name: Reboot system if needed ansible.builtin.meta: flush_handlers - name: Check if we have already configured NVidia devices From 0f1f46ff88fc3f1c2ef3a7134cac628190542557 Mon Sep 17 00:00:00 2001 From: scimerman Date: Mon, 21 Nov 2022 12:03:47 +0100 Subject: [PATCH 16/30] GPU: reinstated nvidia persistenced --- roles/gpu/README.md | 20 +++++++++++++++++--- roles/gpu/defaults/main.yml | 5 ----- roles/gpu/tasks/gpu.yml | 15 +++++++++++++++ 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/roles/gpu/README.md b/roles/gpu/README.md index 16ceaea9b..18fdd90bf 100644 --- a/roles/gpu/README.md +++ b/roles/gpu/README.md @@ -25,14 +25,28 @@ rebuild with every new kernel instalation. - reboots the machine - checks if number of GPU devices reported from `nvidia-smi` is same as in `gpu_count` +## Dead-ends discovered + +`gpu_count` is needed to install the driver, since any other `automatic` detection is +failing sooner or later. To list few: + + - `lspci` found one nvidia device when there were 8, + - `nvidia-smi` reported no device found, when it actually should found some, + - and `nvidia-smi` had up-and-running 3 GPU's when it should be 8 + +This was just while testing, but I can expect more. + +`gpu_count` instead defines the "truth", and can test aginst it, if all the GPUs +are actually working or not. 
+ ## Other comments - The smaller Nvidia .run driver installation file is also avaialble, but then number of commands and options are missing on system (for example `nvidia-smi`) - The long term availablitiy of .run file on nvidia website is not of concern as the cuda archive website is in 2022 still containing the old versions from 2007 - - driver installation vial yum repository is difficult from two reasons: - - first the version needs to be limitied for nvidia-driver rpm and 15 other packages - - not all old versions are available on repository, only recent ones + - driver installation vial yum repository is harder to implement for two reasons: + - the version needs to be limitied for nvidia-driver rpm and 15 (!) other packages + - it seems that not all old versions are available on repository, only 'recent' ones [cuda archive website](https://developer.nvidia.com/cuda-toolkit-archive) diff --git a/roles/gpu/defaults/main.yml b/roles/gpu/defaults/main.yml index ff0bd0357..fd17ba764 100644 --- a/roles/gpu/defaults/main.yml +++ b/roles/gpu/defaults/main.yml @@ -3,9 +3,4 @@ gpu_cuda_version: '11.7.1' gpu_driver_version: '515.65.01' gpu_url_dir: 'https://developer.download.nvidia.com/compute/cuda/{{ gpu_cuda_version }}/local_installers/' gpu_runfile: 'cuda_{{ gpu_cuda_version }}_{{ gpu_driver_version }}_linux.run' - -# gpu_dl_site: 'https://us.download.nvidia.com/tesla' -# gpu_driver_version: '450.80.02' -# gpu_runfile: 'NVIDIA-Linux-x86_64-{{ gpu_driver_version }}.run' -# gpu_url: '{{ gpu_dl_site }}/{{ gpu_driver_version }}/{{ gpu_runfile }}' ... diff --git a/roles/gpu/tasks/gpu.yml b/roles/gpu/tasks/gpu.yml index bcceb6dab..ea0561d2b 100644 --- a/roles/gpu/tasks/gpu.yml +++ b/roles/gpu/tasks/gpu.yml @@ -59,6 +59,21 @@ become: true notify: 'reboot_server' +- name: Install NVidia persistence service + ansible.builtin.copy: + src: nvidia-persistenced.service + dest: /etc/systemd/system/nvidia-persistenced.service + become: true + notify: 'reboot_server' + +- name: Enable a nvidia-persistenced service + ansible.builtin.systemd: + name: nvidia-persistenced.service + state: started + enabled: true + become: true + notify: 'reboot_server' + - name: Enforce reboot, so that we can check if drivers are correctly installed ansible.builtin.meta: flush_handlers From 0b19414fd12c46ff8d9e09510f7fe3e54b659b61 Mon Sep 17 00:00:00 2001 From: scimerman Date: Mon, 21 Nov 2022 12:10:21 +0100 Subject: [PATCH 17/30] GPU: update --- README.md | 5 ----- roles/gpu/handlers/main.yml | 4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 5f944397f..6173c3339 100644 --- a/README.md +++ b/README.md @@ -135,11 +135,6 @@ pip3 install dnspython # Required for Ansible lookup plugin community.general.d # pip3 install passlib # -# On macOS only to prevent this error: -# crypt.crypt not supported on Mac OS X/Darwin, install passlib python module. -# -pip3 install passlib -# # Optional: install Ansible with pip. # You may skip this step if you already installed Ansible by other means. # E.g. with HomeBrew on macOS, with yum or dnf on Linux, etc. diff --git a/roles/gpu/handlers/main.yml b/roles/gpu/handlers/main.yml index 356b88410..d8631425d 100644 --- a/roles/gpu/handlers/main.yml +++ b/roles/gpu/handlers/main.yml @@ -1,7 +1,7 @@ --- -- name: Restart server because of the pending updates +- name: Restart server ansible.builtin.reboot: - msg: "Reboot initiated by Ansible, because of the pending updates" + msg: "Reboot initiated by Ansible" listen: "reboot_server" become: true ... 
From ba0290e2e0b334deb301de90969d3c406cc3432e Mon Sep 17 00:00:00 2001 From: scimerman Date: Mon, 21 Nov 2022 12:15:21 +0100 Subject: [PATCH 18/30] GPU: update --- roles/gpu/tasks/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/roles/gpu/tasks/main.yml b/roles/gpu/tasks/main.yml index d602ae9ee..72c77f7d2 100644 --- a/roles/gpu/tasks/main.yml +++ b/roles/gpu/tasks/main.yml @@ -10,7 +10,7 @@ - name: Reboot system if needed ansible.builtin.meta: flush_handlers -- name: Check if we have already configured NVidia devices +- name: Check how many NVidia devices is up and running (might take some time) ansible.builtin.command: 'nvidia-smi -L' register: smi changed_when: false From ea595cb1ed4dfbf5bdc36644a04ec62730abaaaf Mon Sep 17 00:00:00 2001 From: scimerman Date: Mon, 21 Nov 2022 12:22:16 +0100 Subject: [PATCH 19/30] GPU: ansible-lint fix --- roles/gpu/tasks/main.yml | 2 +- single_role_playbooks/gpu.yml | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/roles/gpu/tasks/main.yml b/roles/gpu/tasks/main.yml index 72c77f7d2..a05551228 100644 --- a/roles/gpu/tasks/main.yml +++ b/roles/gpu/tasks/main.yml @@ -18,7 +18,7 @@ become: true - name: Install GPU driver if not all GPU devices are present and working - ansible.builtin.include: gpu.yml + ansible.builtin.include_tasks: gpu.yml when: ( gpu_count is defined ) and ( smi.stdout|default([])|lower|regex_findall('nvidia')|length != gpu_count ) ... diff --git a/single_role_playbooks/gpu.yml b/single_role_playbooks/gpu.yml index d8651a6ed..2b39956f5 100644 --- a/single_role_playbooks/gpu.yml +++ b/single_role_playbooks/gpu.yml @@ -1,5 +1,6 @@ --- -- hosts: +- name: GPU installation role + hosts: - compute_vm roles: - gpu From 6fd1bc60bd4b1ecf86a4f0b2991549e96c2ad260 Mon Sep 17 00:00:00 2001 From: scimerman Date: Mon, 21 Nov 2022 12:36:07 +0100 Subject: [PATCH 20/30] GPU: readme --- roles/gpu/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/roles/gpu/README.md b/roles/gpu/README.md index 18fdd90bf..f10894e8f 100644 --- a/roles/gpu/README.md +++ b/roles/gpu/README.md @@ -25,7 +25,7 @@ rebuild with every new kernel instalation. - reboots the machine - checks if number of GPU devices reported from `nvidia-smi` is same as in `gpu_count` -## Dead-ends discovered +## Solved issues - described `gpu_count` is needed to install the driver, since any other `automatic` detection is failing sooner or later. To list few: From e8a43572fced45053b78d5e1b1d4cde6a811561f Mon Sep 17 00:00:00 2001 From: scimerman Date: Tue, 22 Nov 2022 17:31:09 +0100 Subject: [PATCH 21/30] GPU: added nvidia service --- roles/gpu/README.md | 16 ++++++++++ roles/gpu/defaults/main.yml | 5 +++ roles/gpu/handlers/main.yml | 11 ++++++- roles/gpu/tasks/gpu.yml | 43 ++++++++++++++++---------- roles/gpu/tasks/main.yml | 2 +- static_inventories/nibbler_cluster.yml | 2 +- 6 files changed, 60 insertions(+), 19 deletions(-) diff --git a/roles/gpu/README.md b/roles/gpu/README.md index f10894e8f..880a5f08e 100644 --- a/roles/gpu/README.md +++ b/roles/gpu/README.md @@ -22,6 +22,7 @@ rebuild with every new kernel instalation. - downloads the cuda .run driver file from nvidia website (version defined in defualts) - installs and compile the Dynamic Kernel Module Support driver - blacklists nouveau +- creates a local nvidia (defaults UID 601) user - reboots the machine - checks if number of GPU devices reported from `nvidia-smi` is same as in `gpu_count` @@ -39,6 +40,12 @@ This was just while testing, but I can expect more. 
`gpu_count` instead defines the "truth", and can test aginst it, if all the GPUs are actually working or not. +Persistenced service script was modified based on trial and error, but is taken +mostly from the example files that come with the driver installation, and can be +found in the folder + + /usr/share/doc/NVIDIA_GLX-1.0/samples/nvidia-persistenced-init.tar.bz2 + ## Other comments - The smaller Nvidia .run driver installation file is also avaialble, but then @@ -48,5 +55,14 @@ are actually working or not. - driver installation vial yum repository is harder to implement for two reasons: - the version needs to be limitied for nvidia-driver rpm and 15 (!) other packages - it seems that not all old versions are available on repository, only 'recent' ones + - nvidia advises against using the `persistenced mode` as it is slowly deprecated and + instead reccomends the use of `persistenced daemon` [cuda archive website](https://developer.nvidia.com/cuda-toolkit-archive) + +## Debugging + +To force driver reinstall + - simply change the `gpu_count` from f.e. 8 to 9, and the role will detect the wrong +number of devices, and therefore try to reinstall the driver. + diff --git a/roles/gpu/defaults/main.yml b/roles/gpu/defaults/main.yml index fd17ba764..97096144d 100644 --- a/roles/gpu/defaults/main.yml +++ b/roles/gpu/defaults/main.yml @@ -3,4 +3,9 @@ gpu_cuda_version: '11.7.1' gpu_driver_version: '515.65.01' gpu_url_dir: 'https://developer.download.nvidia.com/compute/cuda/{{ gpu_cuda_version }}/local_installers/' gpu_runfile: 'cuda_{{ gpu_cuda_version }}_{{ gpu_driver_version }}_linux.run' + +nvidia_user: nvidia +nvidia_uid: 601 # a regular user with UID >500 and <1000, but no login +nvidia_group: nvidia +nvidia_gid: 601 ... diff --git a/roles/gpu/handlers/main.yml b/roles/gpu/handlers/main.yml index d8631425d..519fbb4e4 100644 --- a/roles/gpu/handlers/main.yml +++ b/roles/gpu/handlers/main.yml @@ -1,7 +1,16 @@ --- +- name: Enable / restart nvidia-persistenced service + ansible.builtin.systemd: + name: nvidia-persistenced.service + state: restarted + enabled: true + daemon_reload: true + become: true + listen: 'nvidia_service' + - name: Restart server ansible.builtin.reboot: msg: "Reboot initiated by Ansible" - listen: "reboot_server" + listen: 'reboot_server' become: true ... diff --git a/roles/gpu/tasks/gpu.yml b/roles/gpu/tasks/gpu.yml index ea0561d2b..bd3ff0f6f 100644 --- a/roles/gpu/tasks/gpu.yml +++ b/roles/gpu/tasks/gpu.yml @@ -49,28 +49,39 @@ state: absent become: true -- name: Copy blacklist-nouveau.conf file into modprobe.d to disable Nouveau drivers - ansible.builtin.copy: - src: blacklist-nouveau.conf - dest: /etc/modprobe.d/blacklist-nouveau.conf - owner: root - group: root - mode: '0644' +- name: 'Add nvidia group.' + ansible.builtin.group: + name: '{{ nvidia_group }}' + gid: '{{ nvidia_gid }}' + become: true + +- name: 'Add nvidia user.' 
+ ansible.builtin.user: + name: '{{ nvidia_user }}' + uid: '{{ nvidia_uid }}' + group: '{{ nvidia_group }}' + system: true + shell: /sbin/nologin + create_home: false become: true - notify: 'reboot_server' - name: Install NVidia persistence service - ansible.builtin.copy: + ansible.builtin.template: src: nvidia-persistenced.service dest: /etc/systemd/system/nvidia-persistenced.service + owner: root + group: root + mode: '0644' become: true - notify: 'reboot_server' + notify: 'nvidia_service' -- name: Enable a nvidia-persistenced service - ansible.builtin.systemd: - name: nvidia-persistenced.service - state: started - enabled: true +- name: Copy blacklist-nouveau.conf file into modprobe.d to disable Nouveau drivers + ansible.builtin.copy: + src: blacklist-nouveau.conf + dest: /etc/modprobe.d/blacklist-nouveau.conf + owner: root + group: root + mode: '0644' become: true notify: 'reboot_server' @@ -84,5 +95,5 @@ changed_when: false failed_when: ( smi.rc != 0) or ( smi.stdout|default([])|lower|regex_findall('nvidia')|length != gpu_count ) - become: true + become: false # running nvidia-smi as root stops the service ... diff --git a/roles/gpu/tasks/main.yml b/roles/gpu/tasks/main.yml index a05551228..b1cc3a32e 100644 --- a/roles/gpu/tasks/main.yml +++ b/roles/gpu/tasks/main.yml @@ -15,7 +15,7 @@ register: smi changed_when: false failed_when: false - become: true + become: false # running nvidia-smi as root stops the service - name: Install GPU driver if not all GPU devices are present and working ansible.builtin.include_tasks: gpu.yml diff --git a/static_inventories/nibbler_cluster.yml b/static_inventories/nibbler_cluster.yml index b9f4896e0..fe6a528c1 100644 --- a/static_inventories/nibbler_cluster.yml +++ b/static_inventories/nibbler_cluster.yml @@ -90,7 +90,7 @@ all: deploy_admin_interface: hosts: nb-dai: - cloud_flavor: m1.small + cloud_flavor: m1.large local_volume_size_extra: 3000 user_interface: hosts: From 544d65e313f360a98abde67dae71cb81985a6cd9 Mon Sep 17 00:00:00 2001 From: scimerman Date: Thu, 24 Nov 2022 15:39:14 +0100 Subject: [PATCH 22/30] GPU From 032a596708b51fdeead98ed102d1794cce22ab31 Mon Sep 17 00:00:00 2001 From: scimerman Date: Thu, 24 Nov 2022 15:39:49 +0100 Subject: [PATCH 23/30] GPU: pr update --- roles/gpu/tasks/driver.yml | 63 +++++++++++++++++++ .../gpu/templates/nvidia-persistenced.service | 14 +++++ 2 files changed, 77 insertions(+) create mode 100644 roles/gpu/tasks/driver.yml create mode 100644 roles/gpu/templates/nvidia-persistenced.service diff --git a/roles/gpu/tasks/driver.yml b/roles/gpu/tasks/driver.yml new file mode 100644 index 000000000..8130ce37a --- /dev/null +++ b/roles/gpu/tasks/driver.yml @@ -0,0 +1,63 @@ +--- +- name: Check if system needs to be restarted + ansible.builtin.command: '/bin/needs-restarting -r' + register: needs_restarting + failed_when: 'needs_restarting.rc > 1' + changed_when: 'needs_restarting.rc == 1' + become: true + notify: reboot_server + +- name: Reboot system if needed + ansible.builtin.meta: flush_handlers + +- name: Install yum requirements for gpu driver installation + ansible.builtin.yum: + state: 'installed' + update_cache: true + name: + - tar + - bzip2 + - make + - automake + - gcc + - gcc-c++ + - pciutils + - elfutils-libelf-devel + - libglvnd-devel + - bind-utils + - wget + become: true + +- name: Gather facts to get the latest kernel version + ansible.builtin.setup: + become: true + +- name: Install kernel developement package matching running kernel version + ansible.builtin.yum: + name: 'kernel-devel-{{ 
ansible_kernel }}' + register: yum_result + failed_when: yum_result.rc != 0 + when: true + become: true + +- name: Download a driver installation file from NVidia + ansible.builtin.get_url: + url: '{{ gpu_url_dir }}/{{ gpu_runfile }}' + dest: '/root/{{ gpu_runfile }}' + mode: '0700' + become: true + +- name: Install driver from .run file + ansible.builtin.command: '/root/{{ gpu_runfile }} --silent --driver' + register: install_result + failed_when: install_result.rc != 0 + when: true + become: true + +- name: Remove installation file + ansible.builtin.file: + path: '/root/{{ gpu_runfile }}' + state: absent + become: true + +... diff --git a/roles/gpu/templates/nvidia-persistenced.service b/roles/gpu/templates/nvidia-persistenced.service new file mode 100644 index 000000000..2351286e5 --- /dev/null +++ b/roles/gpu/templates/nvidia-persistenced.service @@ -0,0 +1,14 @@ +[Unit] +Description=Initialize GPU at the startup of the system +Before=slurmd.service + +[Service] +ExecStart=/usr/bin/nvidia-persistenced --verbose --user {{ nvidia_user }} +ExecStopPost=/bin/rm -rf /var/run/nvidia-persistenced +Type=forking +PIDFile=/var/run/nvidia-persistenced/nvidia-persistenced.pid +RestartSec=15 +Restart=always + +[Install] +WantedBy=multi-user.target From d36765e9ffacc47e797b9616d7eb0cfded663848 Mon Sep 17 00:00:00 2001 From: scimerman Date: Thu, 24 Nov 2022 17:50:23 +0100 Subject: [PATCH 24/30] GPU: refractured --- roles/gpu/README.md | 49 +++++++++++++++++++------------------- roles/gpu/tasks/driver.yml | 17 ++++--------- roles/gpu/tasks/main.yml | 31 ++++++++++++------------ 3 files changed, 45 insertions(+), 52 deletions(-) diff --git a/roles/gpu/README.md b/roles/gpu/README.md index 880a5f08e..f6cff89c5 100644 --- a/roles/gpu/README.md +++ b/roles/gpu/README.md @@ -7,24 +7,29 @@ Linux](https://docs.nvidia.com/cuda/pdf/CUDA_Installation_Guide_Linux.pdf). The driver can be installed via yum repository, but the version limiting and driver version control is quite hard to implement. Therefore the driver is installed by downloading and running the cuda .run file. -Driver is installed and compiled as Dynamic Kernel Module Support and will -rebuild with every new kernel instalation. + +The driver features Dynamic Kernel Module Support (DKMS) and will be recompiled +automatically when a new kernel is installed. 
## Role outline -- it expects the gpu_count to be defined per invididual machine -- attempts to gather the GPU device status by running `nvidia-smi` command -- install the GPU driver if - - `nvidia-smi` command is not available (cuda driver was not installed) - - `nvidia-smi` reports different number of GPU devices than expected from `gpu_count` -- yum install on machine packages that is needed for driver install and compile -- downloads the cuda .run driver file from nvidia website (version defined in defualts) -- installs and compile the Dynamic Kernel Module Support driver -- blacklists nouveau -- creates a local nvidia (defaults UID 601) user -- reboots the machine -- checks if number of GPU devices reported from `nvidia-smi` is same as in `gpu_count` +- it expects the `gpu_count` variable to be defined per individual machine, and then + - it attempts to gather the GPU device status by running `nvidia-smi` command + - it detects the NVidia driver version + - executes the GPU driver installation tasks + - checks if the machine needs to be rebooted and reboots it, if needed + - yum installs the packages that are needed to build and install the driver + - yum also installs the kernel-devel package matching the running kernel (correct after the reboot) + - downloads the cuda .run driver file from the nvidia website (version defined in defaults) + - installs and compiles the Dynamic Kernel Module Support driver + - services tasks are deployed on all machines with `gpu_count` defined + - creates a local nvidia (defaults GID 601) group + - creates a local nvidia (defaults UID 601) user + - blacklists nouveau + - installs the `nvidia-persistenced.service` file, which runs as the nvidia user + - reboots the machine + - checks if the number of GPU devices reported by `nvidia-smi` is the same as `gpu_count` ## Solved issues - described @@ -37,8 +42,8 @@ failing sooner or later. To list few: This was just while testing, but I can expect more. -`gpu_count` instead defines the "truth", and can test aginst it, if all the GPUs -are actually working or not. +`gpu_count` instead defines the correct "truth", and the role can test against it - that is, +whether all the GPUs are actually working correctly. Persistenced service script was modified based on trial and error, but is taken mostly from the example files that come with the driver installation, and can be found in the folder @@ -48,21 +53,15 @@ found in the folder ## Other comments - - The smaller Nvidia .run driver installation file is also avaialble, but then + - The smaller Nvidia .run driver installation file is also available, but then number of commands and options are missing on system (for example `nvidia-smi`) - The long term availablitiy of .run file on nvidia website is not of concern as the cuda archive website is in 2022 still containing the old versions from 2007 - - driver installation vial yum repository is harder to implement for two reasons: - - the version needs to be limitied for nvidia-driver rpm and 15 (!) other packages - - it seems that not all old versions are available on repository, only 'recent' ones + - driver installation is possible via the yum repository, but it is harder to implement + for two reasons: + - the version needs to be limited for the nvidia-driver rpm and 15 (!) other packages + - it seems that not all old versions are available on the repository, only 'recent' ones - nvidia advises against using the `persistenced mode` as it is slowly deprecated and instead reccomends the use of `persistenced daemon` [cuda archive website](https://developer.nvidia.com/cuda-toolkit-archive) - -## Debugging - -To force driver reinstall - - simply change the `gpu_count` from f.e.
diff --git a/roles/gpu/tasks/driver.yml b/roles/gpu/tasks/driver.yml
index 8130ce37a..7997d3cff 100644
--- a/roles/gpu/tasks/driver.yml
+++ b/roles/gpu/tasks/driver.yml
@@ -10,11 +10,16 @@
 - name: Reboot system if needed
   ansible.builtin.meta: flush_handlers
 
+- name: Gather facts to get the latest kernel version
+  ansible.builtin.setup:
+  become: true
+
 - name: Install yum requirements for gpu driver installation
   ansible.builtin.yum:
     state: 'installed'
     update_cache: true
     name:
+      - 'kernel-devel-{{ ansible_kernel }}'
       - tar
       - bzip2
       - make
@@ -28,18 +33,6 @@
       - wget
   become: true
 
-- name: Gather facts to get the latest kernel version
-  ansible.builtin.setup:
-  become: true
-
-- name: Install kernel developement package matching running kernel version
-  ansible.builtin.yum:
-    name: 'kernel-devel-{{ ansible_kernel }}'
-  register: yum_result
-  failed_when: yum_result.rc != 0
-  when: true
-  become: true
-
 - name: Download a driver installation file from NVidia
   ansible.builtin.get_url:
     url: '{{ gpu_url_dir }}/{{ gpu_runfile }}'
diff --git a/roles/gpu/tasks/main.yml b/roles/gpu/tasks/main.yml
index b1cc3a32e..dca66a7c0 100644
--- a/roles/gpu/tasks/main.yml
+++ b/roles/gpu/tasks/main.yml
@@ -1,24 +1,25 @@
 ---
-- name: Check if system needs to be restarted
-  ansible.builtin.command: '/bin/needs-restarting -r'
-  register: needs_restarting
-  failed_when: 'needs_restarting.rc > 1'
-  changed_when: 'needs_restarting.rc == 1'
-  become: true
-  notify: reboot_server
-
-- name: Reboot system if needed
-  ansible.builtin.meta: flush_handlers
-
 - name: Check how many NVidia devices is up and running (might take some time)
   ansible.builtin.command: 'nvidia-smi -L'
   register: smi
+  when: gpu_count|default(0) >= 1
+  changed_when: false
+  failed_when: false
+
+- name: Check driver version
+  ansible.builtin.command: '/usr/sbin/modinfo nvidia'
+  register: modinfo
   changed_when: false
   failed_when: false
-  become: false  # running nvidia-smi as root stops the service
+  when: gpu_count|default(0) >= 1
 
 - name: Install GPU driver if not all GPU devices are present and working
-  ansible.builtin.include_tasks: gpu.yml
-  when: ( gpu_count is defined ) and
-       ( smi.stdout|default([])|lower|regex_findall('nvidia')|length != gpu_count )
+  ansible.builtin.include_tasks: driver.yml
+  when: gpu_count|default(0) >= 1 and
+       (( smi.stdout|default([])|lower|regex_findall('nvidia')|length != gpu_count ) or
+       gpu_driver_version not in modinfo.stdout|default("")|regex_search("version:.*"))
+
+- name: Configure user and services
+  ansible.builtin.include_tasks: user_services.yml
+  when: gpu_count|default(0) >= 1
 ...
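
The new trigger logic above keys on two registered results: the `nvidia-smi -L` device count and the `version:` line that `modinfo nvidia` prints for the installed kernel module. Read on its own, the driver is (re)installed when either the device count differs from `gpu_count` or the wanted `gpu_driver_version` does not appear in that version line. The following is only a minimal standalone sketch of the version check, with a debug task standing in for the real include; the extra default('', true) is an assumption added here to guard against regex_search returning no match at all:

- name: Read the version of the currently installed nvidia kernel module
  ansible.builtin.command: '/usr/sbin/modinfo nvidia'
  register: modinfo
  changed_when: false
  failed_when: false

- name: Report that the driver needs to be (re)installed
  ansible.builtin.debug:
    msg: 'Installed nvidia module does not match gpu_driver_version {{ gpu_driver_version }}.'
  when: gpu_driver_version not in (modinfo.stdout | default('') | regex_search('version:.*') | default('', true))
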
From 3542c16e2e8dd4bda9b04ca80ff30929e4c52364 Mon Sep 17 00:00:00 2001
From: scimerman
Date: Fri, 25 Nov 2022 11:24:23 +0100
Subject: [PATCH 25/30] GPU: added user services and removed gpu

---
 .../gpu/tasks/{gpu.yml => user_services.yml} | 51 -------------------
 1 file changed, 51 deletions(-)
 rename roles/gpu/tasks/{gpu.yml => user_services.yml} (50%)

diff --git a/roles/gpu/tasks/gpu.yml b/roles/gpu/tasks/user_services.yml
similarity index 50%
rename from roles/gpu/tasks/gpu.yml
rename to roles/gpu/tasks/user_services.yml
index bd3ff0f6f..559337393 100644
--- a/roles/gpu/tasks/gpu.yml
+++ b/roles/gpu/tasks/user_services.yml
@@ -1,54 +1,4 @@
 ---
-- name: Install yum requirements for gpu driver installation
-  ansible.builtin.yum:
-    state: 'installed'
-    update_cache: true
-    name:
-      - tar
-      - bzip2
-      - make
-      - automake
-      - gcc
-      - gcc-c++
-      - pciutils
-      - elfutils-libelf-devel
-      - libglvnd-devel
-      - bind-utils
-      - wget
-  become: true
-
-- name: Gather facts to get the latest kernel version
-  ansible.builtin.setup:
-  become: true
-
-- name: Install kernel developement package matching running kernel version
-  ansible.builtin.yum:
-    name: 'kernel-devel-{{ ansible_kernel }}'
-  register: yum_result
-  failed_when: yum_result.rc != 0
-  when: true
-  become: true
-
-- name: Download a driver installation file from NVidia
-  ansible.builtin.get_url:
-    url: '{{ gpu_url_dir }}/{{ gpu_runfile }}'
-    dest: '/root/{{ gpu_runfile }}'
-    mode: '0700'
-  become: true
-
-- name: Install driver from .run file
-  ansible.builtin.command: '/root/{{ gpu_runfile }} --silent --driver'
-  register: install_result
-  failed_when: install_result.rc != 0
-  when: true
-  become: true
-
-- name: Remove installation file
-  ansible.builtin.file:
-    path: '/root/{{ gpu_runfile }}'
-    state: absent
-  become: true
-
 - name: 'Add nvidia group.'
   ansible.builtin.group:
     name: '{{ nvidia_group }}'
@@ -95,5 +45,4 @@
   changed_when: false
   failed_when: ( smi.rc != 0) or
                ( smi.stdout|default([])|lower|regex_findall('nvidia')|length != gpu_count )
-  become: false  # running nvidia-smi as root stops the service
 ...

From e670846de81529f314bfafbfd538758a6ed7a00b Mon Sep 17 00:00:00 2001
From: scimerman
Date: Fri, 25 Nov 2022 11:27:32 +0100
Subject: [PATCH 26/30] GPU: services renamed to configuration

---
 roles/gpu/tasks/{user_services.yml => configuration.yml} | 0
 roles/gpu/tasks/main.yml                                  | 4 ++--
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename roles/gpu/tasks/{user_services.yml => configuration.yml} (100%)

diff --git a/roles/gpu/tasks/user_services.yml b/roles/gpu/tasks/configuration.yml
similarity index 100%
rename from roles/gpu/tasks/user_services.yml
rename to roles/gpu/tasks/configuration.yml
diff --git a/roles/gpu/tasks/main.yml b/roles/gpu/tasks/main.yml
index dca66a7c0..ab816d459 100644
--- a/roles/gpu/tasks/main.yml
+++ b/roles/gpu/tasks/main.yml
@@ -19,7 +19,7 @@
        (( smi.stdout|default([])|lower|regex_findall('nvidia')|length != gpu_count ) or
        gpu_driver_version not in modinfo.stdout|default("")|regex_search("version:.*"))
 
-- name: Configure user and services
-  ansible.builtin.include_tasks: user_services.yml
+- name: Configure GPU - users, files and services
+  ansible.builtin.include_tasks: configuration.yml
   when: gpu_count|default(0) >= 1
 ...
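
Taken together, patches 24-26 leave the role with a small dispatcher: detection happens in main.yml, remediation lives in driver.yml, and the user, group and service handling lives in configuration.yml. Purely as an overview of that final shape, condensed from the diffs above (comments added here, task names abridged; the exact file in the repository may differ):

---
# Detection: cheap checks that never fail the play on their own.
- name: Check how many NVidia devices is up and running
  ansible.builtin.command: 'nvidia-smi -L'
  register: smi
  changed_when: false
  failed_when: false
  when: gpu_count | default(0) >= 1

- name: Check driver version
  ansible.builtin.command: '/usr/sbin/modinfo nvidia'
  register: modinfo
  changed_when: false
  failed_when: false
  when: gpu_count | default(0) >= 1

# Remediation: only when devices are missing or the wrong driver version is installed.
- name: Install GPU driver if not all GPU devices are present and working
  ansible.builtin.include_tasks: driver.yml
  when: gpu_count | default(0) >= 1 and
        (( smi.stdout | default('') | lower | regex_findall('nvidia') | length != gpu_count ) or
        gpu_driver_version not in modinfo.stdout | default('') | regex_search('version:.*'))

# Configuration: nvidia user and group, nouveau blacklist, persistence service.
- name: Configure GPU - users, files and services
  ansible.builtin.include_tasks: configuration.yml
  when: gpu_count | default(0) >= 1
...
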
From 08b539bf61124378fd2a0743b1d9ac72f01ff7c8 Mon Sep 17 00:00:00 2001
From: scimerman
Date: Fri, 25 Nov 2022 11:30:11 +0100
Subject: [PATCH 27/30] GPU: updated readme

---
 roles/gpu/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/roles/gpu/README.md b/roles/gpu/README.md
index f6cff89c5..4e58f1c3e 100644
--- a/roles/gpu/README.md
+++ b/roles/gpu/README.md
@@ -23,7 +23,7 @@ automatically when a new kernel is installed.
   - yum also installs the kernel-devel package that (after a reboot) correctly matches the running kernel
   - downloads the cuda .run driver file from the nvidia website (version defined in defaults)
   - installs and compiles the Dynamic Kernel Module Support driver
-  - services tasks are deployed on all machines with `gpu_count` defined
+  - executes the configuration tasks if `gpu_count` is defined
   - creates a local nvidia group (defaults to GID 601)
   - creates a local nvidia user (defaults to UID 601)
   - blacklists nouveau

From 0beed5c1d8ed846ae71bf18dd746a0265aa683b8 Mon Sep 17 00:00:00 2001
From: scimerman
Date: Fri, 25 Nov 2022 11:32:04 +0100
Subject: [PATCH 28/30] GPU: removed stale file

---
 roles/gpu/files/nvidia-persistenced.service | 10 ----------
 1 file changed, 10 deletions(-)
 delete mode 100644 roles/gpu/files/nvidia-persistenced.service

diff --git a/roles/gpu/files/nvidia-persistenced.service b/roles/gpu/files/nvidia-persistenced.service
deleted file mode 100644
index 85df2bdaa..000000000
--- a/roles/gpu/files/nvidia-persistenced.service
+++ /dev/null
@@ -1,10 +0,0 @@
-[Unit]
-Description=Initialize GPU at the startup of the system
-
-[Service]
-ExecStart=/usr/bin/nvidia-persistenced --verbose
-RestartSec=15
-Restart=always
-
-[Install]
-WantedBy=multi-user.target

From 0b07cf1091e626adc518f25ab7b75762b0fddc1c Mon Sep 17 00:00:00 2001
From: scimerman
Date: Thu, 8 Dec 2022 17:04:28 +0100
Subject: [PATCH 29/30] gpu: added node 05

---
 static_inventories/nibbler_cluster.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/static_inventories/nibbler_cluster.yml b/static_inventories/nibbler_cluster.yml
index fe6a528c1..c5b1779bc 100644
--- a/static_inventories/nibbler_cluster.yml
+++ b/static_inventories/nibbler_cluster.yml
@@ -125,7 +125,7 @@ all:
         - eth1
     gpu_a40:  # Must be item from {{ slurm_partitions }} variable defined in group_vars/{{ stack_name }}/vars.yml
       hosts:
-        nb-vcompute04:
+        nb-vcompute[04:05]:
       vars:
         cloud_flavor: gpu.A40_8
         gpu_count: 8

From a3e2e894dc1e3236156b530ff7dea07b89e3e453 Mon Sep 17 00:00:00 2001
From: scimerman
Date: Thu, 8 Dec 2022 17:29:45 +0100
Subject: [PATCH 30/30] GPU: gpu node 05, added ip addresses

---
 group_vars/nibbler_cluster/ip_addresses.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/group_vars/nibbler_cluster/ip_addresses.yml b/group_vars/nibbler_cluster/ip_addresses.yml
index f163d7f98..08b0bc598 100644
--- a/group_vars/nibbler_cluster/ip_addresses.yml
+++ b/group_vars/nibbler_cluster/ip_addresses.yml
@@ -81,6 +81,13 @@ ip_addresses:
     nb_internal_storage:
       address: 10.10.2.129
       netmask: /32
+  nb-vcompute05:
+    nb_internal_management:
+      address: 10.10.1.229
+      netmask: /32
+    nb_internal_storage:
+      address: 10.10.2.39
+      netmask: /32
   nibbler:
     nb_internal_management:
       address: 10.10.1.112
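
Patch 29 relies on Ansible's numeric host range syntax for YAML inventories; the range entry is still a YAML mapping key, so it keeps its trailing colon and expands to one host per value in the range, and each expanded host then picks up its own addresses from the ip_addresses variable added in patch 30. A minimal sketch of the resulting group, with the surrounding inventory structure and exact indentation abridged:

gpu_a40:
  hosts:
    nb-vcompute[04:05]:  # expands to nb-vcompute04 and nb-vcompute05
  vars:
    cloud_flavor: gpu.A40_8
    gpu_count: 8
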