Merge pull request #670 from scimerman/gpu
GPU role (first part)
pneerincx authored Jan 4, 2023
2 parents 491346e + 5543bd8 commit d34e0a9
Showing 13 changed files with 261 additions and 4 deletions.
7 changes: 7 additions & 0 deletions group_vars/nibbler_cluster/ip_addresses.yml
@@ -81,6 +81,13 @@ ip_addresses:
nb_internal_storage:
address: 10.10.2.129
netmask: /32
nb-vcompute05:
nb_internal_management:
address: 10.10.1.229
netmask: /32
nb_internal_storage:
address: 10.10.2.39
netmask: /32
nibbler:
nb_internal_management:
address: 10.10.1.112
2 changes: 2 additions & 0 deletions roles/cluster/defaults/main.yml
@@ -22,6 +22,7 @@ cluster_common_packages:
- ncurses-static
- net-tools
- openssl
- pciutils
- openssl11 # Required for openldap-ltb RPMs.
- qt5-qtbase
- qt5-qtxmlpatterns
@@ -37,4 +38,5 @@ cluster_common_packages:
- urw-base35-fonts
- vim
- wget
- yum-utils
...
67 changes: 67 additions & 0 deletions roles/gpu/README.md
@@ -0,0 +1,67 @@
# NVIDIA GPU installation role for CentOS 7

This role follows the installation instructions for the newest available driver
version, as described in the [NVIDIA CUDA Installation Guide for
Linux](https://docs.nvidia.com/cuda/pdf/CUDA_Installation_Guide_Linux.pdf).

The driver can be installed from a yum repository, but limiting and controlling the
driver version that way is quite hard to implement. Therefore the driver is installed
by downloading and running the CUDA `.run` file.

The driver features Dynamic Kernel Module Support (DKMS) and will be recompiled
automatically when a new kernel is installed.
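
For illustration, a minimal sketch of the unattended, driver-only installation step;
the complete task file shipped in this role (`roles/gpu/tasks/driver.yml` below) also
handles reboots, build dependencies and cleanup of the downloaded installer:

```yaml
# Minimal sketch: run the downloaded installer silently and only install the driver.
- name: Install driver from .run file
  ansible.builtin.command: '/root/{{ gpu_runfile }} --silent --driver'
  register: install_result
  failed_when: install_result.rc != 0
  become: true
```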


## Role outline

- It expects the `gpu_count` variable to be defined per individual machine (see the
  inventory sketch after this list), and then
- it attempts to gather the GPU device status by running the `nvidia-smi` command,
- it detects the installed NVIDIA driver version,
- and, if not all GPU devices are present and working or the driver version differs,
  it executes the GPU driver installation tasks:
  - checks if the machine needs to be rebooted and reboots it, if needed,
  - installs via yum the packages needed to compile and install the driver,
  - yum also installs the `kernel-devel` package matching the running kernel
    (which, after the reboot, is the correct one),
  - downloads the CUDA `.run` driver file from the NVIDIA website (version defined
    in the role defaults),
  - installs and compiles the Dynamic Kernel Module Support (DKMS) driver.
- It executes the configuration tasks if `gpu_count` is defined:
  - creates a local nvidia group (default GID 601),
  - creates a local nvidia user (default UID 601),
  - blacklists the nouveau driver,
  - installs the `nvidia-persistenced.service` unit file, which runs the daemon as
    the nvidia user,
  - reboots the machine,
  - checks that the number of GPU devices reported by `nvidia-smi` equals `gpu_count`.
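
A minimal inventory sketch showing how `gpu_count` is set per group of machines; the
group and host names here simply mirror the real definition in
`static_inventories/nibbler_cluster.yml` further down:

```yaml
# Sketch of a static inventory excerpt: gpu_count drives this role.
gpu_a40:
  hosts:
    nb-vcompute[04:05]:
  vars:
    cloud_flavor: gpu.A40_8
    gpu_count: 8
```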

## Solved issues

The `gpu_count` variable is needed to install the driver, since any other 'automatic'
detection fails sooner or later. To list a few of the failures observed:

- `lspci` found one NVIDIA device when there were 8,
- `nvidia-smi` reported that no devices were found when it actually should have found some,
- and `nvidia-smi` reported 3 GPUs up and running when there should have been 8.

This happened just while testing, so more of the same can be expected.

`gpu_count` instead defines the expected "truth", and the role can test against it
whether all the GPUs are actually working correctly.
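
A simplified sketch of that check (the complete task is in
`roles/gpu/tasks/configuration.yml` below): count the devices listed by `nvidia-smi -L`
and fail when the count does not match `gpu_count`.

```yaml
# Sketch: fail when nvidia-smi errors out or lists fewer/more GPUs than expected.
- name: Final check to confirm all devices are working
  ansible.builtin.command: 'nvidia-smi -L'
  register: smi
  changed_when: false
  failed_when: (smi.rc != 0) or
               (smi.stdout | lower | regex_findall('nvidia') | length != gpu_count)
```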

The persistence daemon service unit was modified based on trial and error, but is taken
mostly from the example files that come with the driver installation, which can be found
in

/usr/share/doc/NVIDIA_GLX-1.0/samples/nvidia-persistenced-init.tar.bz2

## Other comments

- A smaller NVIDIA `.run` driver-only installation file is also available, but then a
  number of commands and options are missing from the system (for example `nvidia-smi`).
- The long-term availability of the `.run` file on the NVIDIA website is not a concern,
  as the CUDA archive website (linked below) still contains, in 2022, the old versions
  going back to 2007.
- Driver installation is also possible via a yum repository, but it is harder to
  implement for two reasons:
  - the version needs to be pinned for the nvidia-driver RPM and 15 (!) other packages,
  - and it seems that not all old versions are available in the repository, only
    'recent' ones.
- NVIDIA advises against using `persistence mode` as it is being slowly deprecated and
  instead recommends the use of the `persistence daemon`.

[cuda archive website](https://developer.nvidia.com/cuda-toolkit-archive)
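
If a different CUDA/driver combination is ever needed, the versions can be pinned by
overriding the role defaults (see `roles/gpu/defaults/main.yml` below), for example in
group variables. The pair shown here is only an illustration and has to match an
installer that actually exists in the CUDA archive:

```yaml
# Hypothetical override in group_vars; the two values must belong together,
# i.e. correspond to an existing cuda_<cuda>_<driver>_linux.run installer.
gpu_cuda_version: '11.8.0'
gpu_driver_version: '520.61.05'
```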
11 changes: 11 additions & 0 deletions roles/gpu/defaults/main.yml
@@ -0,0 +1,11 @@
---
gpu_cuda_version: '11.7.1'
gpu_driver_version: '515.65.01'
gpu_url_dir: 'https://developer.download.nvidia.com/compute/cuda/{{ gpu_cuda_version }}/local_installers/'
gpu_runfile: 'cuda_{{ gpu_cuda_version }}_{{ gpu_driver_version }}_linux.run'

nvidia_user: nvidia
nvidia_uid: 601 # a regular user with UID >500 and <1000, but no login
nvidia_group: nvidia
nvidia_gid: 601
...
2 changes: 2 additions & 0 deletions roles/gpu/files/blacklist-nouveau.conf
@@ -0,0 +1,2 @@
blacklist nouveau
options nouveau modeset=0
16 changes: 16 additions & 0 deletions roles/gpu/handlers/main.yml
@@ -0,0 +1,16 @@
---
- name: Enable / restart nvidia-persistenced service
ansible.builtin.systemd:
name: nvidia-persistenced.service
state: restarted
enabled: true
daemon_reload: true
become: true
listen: 'nvidia_service'

- name: Restart server
ansible.builtin.reboot:
msg: "Reboot initiated by Ansible"
listen: 'reboot_server'
become: true
...
48 changes: 48 additions & 0 deletions roles/gpu/tasks/configuration.yml
@@ -0,0 +1,48 @@
---
- name: 'Add nvidia group.'
ansible.builtin.group:
name: '{{ nvidia_group }}'
gid: '{{ nvidia_gid }}'
become: true

- name: 'Add nvidia user.'
ansible.builtin.user:
name: '{{ nvidia_user }}'
uid: '{{ nvidia_uid }}'
group: '{{ nvidia_group }}'
system: true
shell: /sbin/nologin
create_home: false
become: true

- name: Install NVidia persistence service
ansible.builtin.template:
src: nvidia-persistenced.service
dest: /etc/systemd/system/nvidia-persistenced.service
owner: root
group: root
mode: '0644'
become: true
notify: 'nvidia_service'

- name: Copy blacklist-nouveau.conf file into modprobe.d to disable Nouveau drivers
ansible.builtin.copy:
src: blacklist-nouveau.conf
dest: /etc/modprobe.d/blacklist-nouveau.conf
owner: root
group: root
mode: '0644'
become: true
notify: 'reboot_server'

- name: Enforce reboot, so that we can check if drivers are correctly installed
ansible.builtin.meta: flush_handlers

- name: Final check to confirm all devices are working
ansible.builtin.command: 'nvidia-smi -L'
register: smi
when: true
changed_when: false
failed_when: ( smi.rc != 0) or
( smi.stdout|default([])|lower|regex_findall('nvidia')|length != gpu_count )
...
56 changes: 56 additions & 0 deletions roles/gpu/tasks/driver.yml
@@ -0,0 +1,56 @@
---
- name: Check if system needs to be restarted
ansible.builtin.command: '/bin/needs-restarting -r'
register: needs_restarting
failed_when: 'needs_restarting.rc > 1'
changed_when: 'needs_restarting.rc == 1'
become: true
notify: reboot_server

- name: Reboot system if needed
ansible.builtin.meta: flush_handlers

- name: Gather facts to get the latest kernel version
ansible.builtin.setup:
become: true

- name: Install yum requirements for gpu driver installation
ansible.builtin.yum:
state: 'installed'
update_cache: true
name:
- 'kernel-devel-{{ ansible_kernel }}'
- tar
- bzip2
- make
- automake
- gcc
- gcc-c++
- pciutils
- elfutils-libelf-devel
- libglvnd-devel
- bind-utils
- wget
become: true

- name: Download a driver installation file from NVidia
ansible.builtin.get_url:
url: '{{ gpu_url_dir }}/{{ gpu_runfile }}'
dest: '/root/{{ gpu_runfile }}'
mode: '0700'
become: true

- name: Install driver from .run file
ansible.builtin.command: '/root/{{ gpu_runfile }} --silent --driver'
register: install_result
failed_when: install_result.rc != 0
when: true
become: true

- name: Remove installation file
ansible.builtin.file:
path: '/root/{{ gpu_runfile }}'
state: absent
become: true

...
25 changes: 25 additions & 0 deletions roles/gpu/tasks/main.yml
@@ -0,0 +1,25 @@
---
- name: Check how many NVidia devices is up and running (might take some time)
ansible.builtin.command: 'nvidia-smi -L'
register: smi
when: gpu_count|default(0) >= 1
changed_when: false
failed_when: false

- name: Check driver version
ansible.builtin.command: '/usr/sbin/modinfo nvidia'
register: modinfo
changed_when: false
failed_when: false
when: gpu_count|default(0) >= 1

- name: Install GPU driver if not all GPU devices are present and working
ansible.builtin.include_tasks: driver.yml
when: gpu_count|default(0) >= 1 and
(( smi.stdout|default([])|lower|regex_findall('nvidia')|length != gpu_count ) or
gpu_driver_version not in modinfo.stdout|default("")|regex_search("version:.*"))

- name: Configure GPU - users, files and services
ansible.builtin.include_tasks: configuration.yml
when: gpu_count|default(0) >= 1
...
14 changes: 14 additions & 0 deletions roles/gpu/templates/nvidia-persistenced.service
@@ -0,0 +1,14 @@
[Unit]
Description=Initialize GPU at the startup of the system
Before=slurmd.service

[Service]
ExecStart=/usr/bin/nvidia-persistenced --verbose --user {{ nvidia_user }}
ExecStopPost=/bin/rm -rf /var/run/nvidia-persistenced
Type=forking
PIDFile=/var/run/nvidia-persistenced/nvidia-persistenced.pid
RestartSec=15
Restart=always

[Install]
WantedBy=multi-user.target
1 change: 1 addition & 0 deletions single_group_playbooks/cluster_part1.yml
@@ -23,6 +23,7 @@
- figlet_motd
- node_exporter
- cluster
- gpu # needs to run after role 'cluster'
- resolver
- coredumps
...
7 changes: 7 additions & 0 deletions single_role_playbooks/gpu.yml
@@ -0,0 +1,7 @@
---
- name: GPU installation role
hosts:
- compute_vm
roles:
- gpu
...
9 changes: 5 additions & 4 deletions static_inventories/nibbler_cluster.yml
@@ -90,8 +90,8 @@ all:
deploy_admin_interface:
hosts:
nb-dai:
cloud_flavor: m1.small
local_volume_size_extra: 200
cloud_flavor: m1.large
local_volume_size_extra: 3000
user_interface:
hosts:
nibbler:
@@ -125,9 +125,10 @@ all:
- eth1
gpu_a40: # Must be item from {{ slurm_partitions }} variable defined in group_vars/{{ stack_name }}/vars.yml
hosts:
nb-vcompute04:
nb-vcompute[04:05]:
vars:
cloud_flavor: gpu.A40
cloud_flavor: gpu.A40_8
gpu_count: 8
local_volume_size_extra: 1
slurm_sockets: 32
slurm_cores_per_socket: 1
