From 8e696dee8891ab8459a3f9be06933dc7a6224f89 Mon Sep 17 00:00:00 2001
From: Sander Land <48946947+sanderland@users.noreply.github.com>
Date: Tue, 20 Aug 2024 17:43:54 +0200
Subject: [PATCH 1/2] Limit check_nvlink_connectivity parsing to gpu_count rows

---
 litgpt/utils.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/litgpt/utils.py b/litgpt/utils.py
index 006e67d2cd..74aaf9c26c 100644
--- a/litgpt/utils.py
+++ b/litgpt/utils.py
@@ -628,9 +628,7 @@ def check_nvlink_connectivity(fabric=None):
             gpu_regex = re.compile(r'^GPU\d+$')
             gpu_count = len([header for header in headers if gpu_regex.match(header)])
 
-            for line in lines[start_index:]:
-                if not line.strip():
-                    break
+            for line in lines[start_index:start_index + gpu_count]:
                 gpu_matrix.append(line.strip())
 
             all_nvlink = True
@@ -650,6 +648,3 @@ def check_nvlink_connectivity(fabric=None):
 
         except Exception as e:
             custom_print(f"An error occurred: {e}")
-
-        except Exception as e:
-            custom_print(f"An error occurred: {e}")

From 31eeffb753d19d5a8fd77616b026d70fc4e2cb1e Mon Sep 17 00:00:00 2001
From: rasbt <mail@sebastianraschka.com>
Date: Wed, 21 Aug 2024 14:28:05 +0000
Subject: [PATCH 2/2] Merge NVLink parsing loops and expand unit tests

---
 litgpt/utils.py     |  4 +--
 tests/test_utils.py | 82 ++++++++++++++++++++++++++++++++++++---------
 2 files changed, 68 insertions(+), 18 deletions(-)

diff --git a/litgpt/utils.py b/litgpt/utils.py
index 74aaf9c26c..0a929e2a12 100644
--- a/litgpt/utils.py
+++ b/litgpt/utils.py
@@ -628,11 +628,9 @@ def check_nvlink_connectivity(fabric=None):
             gpu_regex = re.compile(r'^GPU\d+$')
             gpu_count = len([header for header in headers if gpu_regex.match(header)])
 
+            all_nvlink = True
             for line in lines[start_index:start_index + gpu_count]:
                 gpu_matrix.append(line.strip())
-
-            all_nvlink = True
-            for line in gpu_matrix:
                 connections = line.split()[1:1 + gpu_count]
                 if not all("NV" in conn for conn in connections if conn != "X"):
                     all_nvlink = False
diff --git a/tests/test_utils.py b/tests/test_utils.py
index bf8a6d2793..d4f6be2076 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -457,16 +457,79 @@ def test_file_size_above_limit_on_gpu():
 
 
 @pytest.fixture
-def nvlink_connected_output():
-    return mock.MagicMock(stdout="""GPU0	GPU1	GPU2	GPU3
+def all_nvlink_connected_output():
+    return mock.MagicMock(stdout="""        GPU0	GPU1	GPU2	GPU3
 GPU0	X	NV12	NV12	NV12
 GPU1	NV12	X	NV12	NV12
 GPU2	NV12	NV12	X	NV12
 GPU3	NV12	NV12	NV12	X""", returncode=0)
 
 
+@mock.patch("subprocess.run")
+def test_all_nvlink_connected(mock_run, all_nvlink_connected_output):
+    mock_run.return_value = all_nvlink_connected_output
+    with mock.patch("builtins.print") as mock_print:
+        check_nvlink_connectivity()
+        mock_print.assert_any_call("All GPUs are fully connected via NVLink.")
+
+
 @pytest.fixture
 def nvlink_partially_connected_output():
+    return mock.MagicMock(stdout="""        GPU0    GPU1    GPU2    GPU3    CPU Affinity
+GPU0     X      NV1     SYS     SYS     0-7
+GPU1    NV1      X      SYS     SYS     0-7
+GPU2    SYS     SYS      X      NV1     8-15
+GPU3    SYS     SYS     NV1      X      8-15
+
+Legend:
+  X   = Self
+  NV1 = Connected via NVLink with 1 hop
+  SYS = Connected via the PCIe or CPU subsystem""", returncode=0)
+
+
+@mock.patch("subprocess.run")
+def test_nvlink_partially_connected_output(mock_run, nvlink_partially_connected_output):
+    mock_run.return_value = nvlink_partially_connected_output
+    with mock.patch("builtins.print") as mock_print:
+        check_nvlink_connectivity()
+        mock_print.assert_any_call(
+            "Warning: Not all GPUs are fully connected via NVLink. Some GPUs are connected via slower interfaces. "
+            "It is recommended to switch to a different machine with faster GPU connections for optimal multi-GPU training performance."
+        )
+
+
+@pytest.fixture
+def nvlink_not_connected_output():
+    return mock.MagicMock(stdout="""        GPU0    GPU1    GPU2    GPU3    CPU Affinity    NUMA Affinity   GPU NUMA ID
+GPU0     X      PHB     PHB     PHB     0-47    0               N/A
+GPU1    PHB      X      PHB     PHB     0-47    0               N/A
+GPU2    PHB     PHB      X      PHB     0-47    0               N/A
+GPU3    PHB     PHB     PHB      X      0-47    0               N/A
+
+Legend:
+
+  X    = Self
+  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
+  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
+  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
+  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
+  PIX  = Connection traversing at most a single PCIe bridge
+  NV#  = Connection traversing a bonded set of # NVLinks""", returncode=0)
+
+
+@mock.patch("subprocess.run")
+def test_nvlink_not_connected_output(mock_run, nvlink_not_connected_output):
+    mock_run.return_value = nvlink_not_connected_output
+    with mock.patch("builtins.print") as mock_print:
+        check_nvlink_connectivity()
+        mock_print.assert_any_call(
+            "Warning: Not all GPUs are fully connected via NVLink. Some GPUs are connected via slower interfaces. "
+            "It is recommended to switch to a different machine with faster GPU connections for optimal multi-GPU training performance."
+        )
+
+
+@pytest.fixture
+def nvlink_all_gpu_connected_but_other_connected_output():
     return mock.MagicMock(stdout="""	GPU0	GPU1	GPU2	GPU3	GPU4	GPU5	GPU6	GPU7	NIC0	NIC1	NIC2	NIC3	NIC4	NIC5	NIC6	NIC7	NIC8	NIC9	CPU Affinity	NUMA Affinity	GPU NUMA ID
 GPU0	X 	NV12	NV12	NV12	NV12	NV12	NV12	NV12	SYS	SYS	PXB	PXB	SYS	SYS	SYS	SYS	SYS	SYS	0-63,128-191	0		N/A
 GPU1	NV12	X 	NV12	NV12	NV12	NV12	NV12	NV12	SYS	SYS	PXB	PXB	SYS	SYS	SYS	SYS	SYS	SYS	0-63,128-191	0		N/A
@@ -514,19 +577,8 @@ def nvlink_partially_connected_output():
 
 
 @mock.patch("subprocess.run")
-def test_all_nvlink_connected(mock_run, nvlink_connected_output):
-    mock_run.return_value = nvlink_connected_output
+def test_nvlink_all_gpu_connected_but_other_connected_output(mock_run, nvlink_all_gpu_connected_but_other_connected_output):
+    mock_run.return_value = nvlink_all_gpu_connected_but_other_connected_output
     with mock.patch("builtins.print") as mock_print:
         check_nvlink_connectivity()
         mock_print.assert_any_call("All GPUs are fully connected via NVLink.")
-
-
-@mock.patch("subprocess.run")
-def test_not_all_nvlink_connected(mock_run, nvlink_partially_connected_output):
-    mock_run.return_value = nvlink_partially_connected_output
-    with mock.patch("builtins.print") as mock_print:
-        check_nvlink_connectivity()
-        mock_print.assert_any_call(
-            "Warning: Not all GPUs are fully connected via NVLink. Some GPUs are connected via slower interfaces. "
-            "It is recommended to switch to a different machine with faster GPU connections for optimal multi-GPU training performance."
-        )