From ee017e829d0dd04b6805bac932756a80cac53923 Mon Sep 17 00:00:00 2001
From: Pieter Neerincx
Date: Thu, 23 Feb 2023 23:29:43 +0100
Subject: [PATCH 1/2] Improved lustre client config to survive reboots.

---
Review notes (text between the "---" line and the diff is not part of the
commit and is ignored when the patch is applied):

* Adds a task that runs "lnetctl net del --net tcp" ahead of the existing
  "lnetctl net add" loop. The task fails only when rc > 0 and stderr does
  not contain "errno: -2"; it reports "changed" whenever rc == 0.
* Renames the registered variable of the add task from lnetctl_status to
  lnetctl_add_status.
* Writes /etc/lnet.conf from "lnetctl net show" instead of "lnetctl export".
* Adds a task that enables and starts lnet.service with a daemon reload.

NOTE(review): this copy of the patch had lost its line breaks and leading
whitespace. Indentation below was reconstructed with 2-space Ansible layout;
verify the context lines against the target tree, or apply with
--ignore-whitespace. The From: header carries no e-mail address in this copy.

 roles/lustre_client/tasks/install.yml | 30 +++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/roles/lustre_client/tasks/install.yml b/roles/lustre_client/tasks/install.yml
index 5fc8677be..bd1b32cc8 100644
--- a/roles/lustre_client/tasks/install.yml
+++ b/roles/lustre_client/tasks/install.yml
@@ -106,15 +106,27 @@
   become: true
 
 - name: 'Create lustre lnet config: part 2.'
+  ansible.builtin.command:
+    cmd: |
+      lnetctl net del --net tcp
+  register: lnetctl_del_status
+  changed_when:
+    - lnetctl_del_status.rc == 0
+  failed_when:
+    - lnetctl_del_status.rc > 0
+    - '"errno: -2" not in lnetctl_del_status.stderr'
+  become: true
+
+- name: 'Create lustre lnet config: part 3.'
   ansible.builtin.command:
     cmd: |
       lnetctl net add --net "{{ item.name }}" --if "{{ item.interface }}"
-  register: lnetctl_status
+  register: lnetctl_add_status
   changed_when:
-    - lnetctl_status.rc == 0
+    - lnetctl_add_status.rc == 0
   failed_when:
-    - lnetctl_status.rc > 0
-    - '"errno: -17" not in lnetctl_status.stderr'
+    - lnetctl_add_status.rc > 0
+    - '"errno: -17" not in lnetctl_add_status.stderr'
   with_items: "{{ lustre_client_networks }}"
   become: true
 
@@ -122,7 +134,7 @@
   ansible.builtin.shell:
     cmd: |
       checksum_old=$(md5sum /etc/lnet.conf)
-      lnetctl export /etc/lnet.conf
+      lnetctl net show > /etc/lnet.conf
       checksum_new=$(md5sum /etc/lnet.conf)
       if [[ "${checksum_old}" != "${checksum_new}" ]]; then
         echo 'lnet.conf has changed.'
@@ -131,4 +143,12 @@
   changed_when:
     - '"lnet.conf has changed." in lnetconf_status.stdout'
   become: true
+
+- name: Start lnet.service.
+  ansible.builtin.systemd:
+    name: lnet.service
+    enabled: true
+    state: started
+    daemon_reload: true
+  become: true
 ...
From 6e159c2ce79f409fb0fab8ac1c77551286ea7135 Mon Sep 17 00:00:00 2001
From: Pieter Neerincx
Date: Sun, 26 Feb 2023 18:25:29 +0100
Subject: [PATCH 2/2] Improved lustre_client: ensure lnet starts and lustre file systems are mounted on (re)boot.

---
Review notes (text between the "---" line and the diff is not part of the
commit and is ignored when the patch is applied):

* handlers/main.yml: adds two handlers listening on "reenable_lnet"
  (disable, then enable lnet.service) and one listening on "restart_lnet"
  (restart lnet.service); each performs a daemon reload.
* tasks/install.yml: writes /etc/lnet.conf with "lnetctl export -b" instead
  of "lnetctl net show", and notifies restart_lnet when the file changed.
* tasks/install.yml: adds two lineinfile tasks that edit
  /usr/lib/systemd/system/lnet.service in place (with backup):
  "SuccessExitStatus=239" after [Service] and
  "WantedBy=multi-user.target remote-fs.target" after [Install].

NOTE(review): the unit file under /usr/lib/systemd/system is edited directly;
presumably a package update can replace it - confirm, or consider a drop-in.
NOTE(review): this copy of the patch had lost its line breaks and leading
whitespace. Indentation below was reconstructed with 2-space Ansible layout;
verify the context lines against the target tree, or apply with
--ignore-whitespace. The From: header carries no e-mail address in this copy.

 roles/lustre_client/handlers/main.yml | 24 +++++++++++++++++++
 roles/lustre_client/tasks/install.yml | 33 ++++++++++++++++++++++++++-
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/roles/lustre_client/handlers/main.yml b/roles/lustre_client/handlers/main.yml
index 08ee7aa23..fdd4de594 100644
--- a/roles/lustre_client/handlers/main.yml
+++ b/roles/lustre_client/handlers/main.yml
@@ -3,4 +3,28 @@
   ansible.builtin.reboot:
   become: true
   listen: reboot
+
+- name: 'Re-enable lnet.service part 1: disable lnet.service.'
+  ansible.builtin.systemd:
+    name: lnet.service
+    enabled: false
+    daemon_reload: true
+  become: true
+  listen: reenable_lnet
+
+- name: 'Re-enable lnet.service part 2: enable lnet.service.'
+  ansible.builtin.systemd:
+    name: lnet.service
+    enabled: true
+    daemon_reload: true
+  become: true
+  listen: reenable_lnet
+
+- name: Restart lnet.service.
+  ansible.builtin.systemd:
+    name: lnet.service
+    state: restarted
+    daemon_reload: true
+  become: true
+  listen: restart_lnet
 ...
diff --git a/roles/lustre_client/tasks/install.yml b/roles/lustre_client/tasks/install.yml
index bd1b32cc8..8ec990146 100644
--- a/roles/lustre_client/tasks/install.yml
+++ b/roles/lustre_client/tasks/install.yml
@@ -134,7 +134,7 @@
   ansible.builtin.shell:
     cmd: |
       checksum_old=$(md5sum /etc/lnet.conf)
-      lnetctl net show > /etc/lnet.conf
+      lnetctl export -b /etc/lnet.conf
       checksum_new=$(md5sum /etc/lnet.conf)
       if [[ "${checksum_old}" != "${checksum_new}" ]]; then
         echo 'lnet.conf has changed.'
@@ -142,6 +142,37 @@
   register: lnetconf_status
   changed_when:
     - '"lnet.conf has changed." in lnetconf_status.stdout'
+  notify:
+    - restart_lnet
+  become: true
+
+- name: Patch lnet.service file for systemd to ignore failure if a Lustre network ID was already loaded.
+  ansible.builtin.lineinfile:
+    path: /usr/lib/systemd/system/lnet.service
+    backup: true
+    insertafter: '\[Service\]'
+    regexp: '^#?SuccessExitStatus='
+    line: 'SuccessExitStatus=239'  # lustre NID already loaded.
+    owner: root
+    group: root
+    mode: '0644'
+  notify:
+    - restart_lnet
+  become: true
+
+- name: Patch lnet.service file for systemd to start lnet.service before remote-fs.target.
+  ansible.builtin.lineinfile:
+    path: /usr/lib/systemd/system/lnet.service
+    backup: true
+    insertafter: '\[Install\]'
+    regexp: '^#?WantedBy='
+    line: 'WantedBy=multi-user.target remote-fs.target'
+    owner: root
+    group: root
+    mode: '0644'
+  notify:
+    - reenable_lnet
+    - restart_lnet
   become: true
 
 - name: Start lnet.service.