Skip to content

Commit

Permalink
GPU: pr update
Browse files Browse the repository at this point in the history
  • Loading branch information
scimerman committed Nov 24, 2022
1 parent 544d65e commit 032a596
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 0 deletions.
63 changes: 63 additions & 0 deletions roles/gpu/tasks/driver.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
---
# Detect a pending reboot and act on it before anything kernel-related is
# installed. The exit status of `needs-restarting -r` is interpreted by the
# conditions below: 1 is reported as "changed" (which notifies the
# `reboot_server` handler, defined outside this file) and anything above 1 is
# treated as a failure.
- name: Check if system needs to be restarted
  become: true
  ansible.builtin.command:
    cmd: /bin/needs-restarting -r
  register: needs_restarting
  changed_when: needs_restarting.rc == 1
  failed_when: needs_restarting.rc > 1
  notify: reboot_server

# Run the handlers notified so far immediately instead of at the end of the
# play, so a requested reboot happens before the following tasks.
- name: Reboot system if needed
  ansible.builtin.meta: flush_handlers

# Packages installed ahead of the driver installer (archive tools, compiler
# toolchain, PCI/ELF/GL development helpers and download utilities).
# The package cache is refreshed as part of the same call.
- name: Install yum requirements for gpu driver installation
  become: true
  ansible.builtin.yum:
    name:
      - tar
      - bzip2
      - make
      - automake
      - gcc
      - gcc-c++
      - pciutils
      - elfutils-libelf-devel
      - libglvnd-devel
      - bind-utils
      - wget
    state: installed
    update_cache: true

# Re-collect host facts at this point so that `ansible_kernel`, used by the
# next task, is current (handlers may have rebooted the host earlier in this
# file).
- name: Gather facts to get the latest kernel version
  become: true
  ansible.builtin.setup:

# Install the kernel-devel package whose version matches the running kernel;
# the package name is built from the `ansible_kernel` fact.
- name: Install kernel development package matching running kernel version
  become: true
  ansible.builtin.yum:
    name: 'kernel-devel-{{ ansible_kernel }}'
    state: present
  # The module's own failure detection is relied upon: a custom
  # `failed_when` on `rc` would hide the real error message whenever a failure
  # result carries no `rc` key. The result is still registered under the same
  # name in case something else reads it.
  register: yum_result

# Fetch the installer named by `gpu_runfile` from `gpu_url_dir` into /root.
# Mode 0700 makes it executable (and accessible) for the owner only, which is
# what the next task needs in order to run it.
- name: Download a driver installation file from NVidia
  become: true
  ansible.builtin.get_url:
    url: '{{ gpu_url_dir }}/{{ gpu_runfile }}'
    dest: '/root/{{ gpu_runfile }}'
    mode: '0700'

# Execute the downloaded installer with the `--silent --driver` options.
- name: Install driver from .run file
  become: true
  ansible.builtin.command:
    # `argv` passes each element as one argument, so a `gpu_runfile` value
    # containing whitespace is not split into several words.
    argv:
      - '/root/{{ gpu_runfile }}'
      - '--silent'
      - '--driver'
  # A non-zero exit status already fails the task (command module default),
  # so no custom `failed_when` is needed. The result stays registered under
  # the same name.
  register: install_result
  # The installer runs on every play, so the task always reports "changed";
  # stated explicitly rather than left to the module default.
  changed_when: true

# Clean up: delete the installer that was downloaded to /root.
- name: Remove installation file
  become: true
  ansible.builtin.file:
    state: absent
    path: '/root/{{ gpu_runfile }}'

...
14 changes: 14 additions & 0 deletions roles/gpu/templates/nvidia-persistenced.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# systemd unit for nvidia-persistenced. This file is a template:
# `nvidia_user` is substituted when it is rendered.
[Unit]
Description=Initialize GPU at the startup of the system
# When both units are started, this one is ordered ahead of slurmd.
Before=slurmd.service

[Service]
# The daemon forks into the background; systemd identifies the main process
# through the PID file below.
Type=forking
# /run is used instead of the legacy /var/run alias (same location through
# the compatibility symlink); newer systemd versions warn about /var/run here.
PIDFile=/run/nvidia-persistenced/nvidia-persistenced.pid
ExecStart=/usr/bin/nvidia-persistenced --verbose --user {{ nvidia_user }}
# Remove the daemon's runtime directory once the service has stopped.
ExecStopPost=/bin/rm -rf /run/nvidia-persistenced
# Always restart the daemon, waiting 15 seconds between attempts.
Restart=always
RestartSec=15

[Install]
WantedBy=multi-user.target

0 comments on commit 032a596

Please sign in to comment.