From 8e696dee8891ab8459a3f9be06933dc7a6224f89 Mon Sep 17 00:00:00 2001 From: Sander Land <48946947+sanderland@users.noreply.github.com> Date: Tue, 20 Aug 2024 17:43:54 +0200 Subject: [PATCH 1/2] Update check_nvlink_connectivity --- litgpt/utils.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/litgpt/utils.py b/litgpt/utils.py index 006e67d2cd..74aaf9c26c 100644 --- a/litgpt/utils.py +++ b/litgpt/utils.py @@ -628,9 +628,7 @@ def check_nvlink_connectivity(fabric=None): gpu_regex = re.compile(r'^GPU\d+$') gpu_count = len([header for header in headers if gpu_regex.match(header)]) - for line in lines[start_index:]: - if not line.strip(): - break + for line in lines[start_index:start_index + gpu_count]: gpu_matrix.append(line.strip()) all_nvlink = True @@ -650,6 +648,3 @@ def check_nvlink_connectivity(fabric=None): except Exception as e: custom_print(f"An error occurred: {e}") - - except Exception as e: - custom_print(f"An error occurred: {e}") From 31eeffb753d19d5a8fd77616b026d70fc4e2cb1e Mon Sep 17 00:00:00 2001 From: rasbt Date: Wed, 21 Aug 2024 14:28:05 +0000 Subject: [PATCH 2/2] update code and unit tests --- litgpt/utils.py | 4 +-- tests/test_utils.py | 82 ++++++++++++++++++++++++++++++++++++--------- 2 files changed, 68 insertions(+), 18 deletions(-) diff --git a/litgpt/utils.py b/litgpt/utils.py index 74aaf9c26c..0a929e2a12 100644 --- a/litgpt/utils.py +++ b/litgpt/utils.py @@ -628,11 +628,9 @@ def check_nvlink_connectivity(fabric=None): gpu_regex = re.compile(r'^GPU\d+$') gpu_count = len([header for header in headers if gpu_regex.match(header)]) + all_nvlink = True for line in lines[start_index:start_index + gpu_count]: gpu_matrix.append(line.strip()) - - all_nvlink = True - for line in gpu_matrix: connections = line.split()[1:1 + gpu_count] if not all("NV" in conn for conn in connections if conn != "X"): all_nvlink = False diff --git a/tests/test_utils.py b/tests/test_utils.py index bf8a6d2793..d4f6be2076 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -457,16 +457,79 @@ def test_file_size_above_limit_on_gpu(): @pytest.fixture -def nvlink_connected_output(): - return mock.MagicMock(stdout="""GPU0 GPU1 GPU2 GPU3 +def all_nvlink_connected_output(): + return mock.MagicMock(stdout=""" GPU0 GPU1 GPU2 GPU3 GPU0 X NV12 NV12 NV12 GPU1 NV12 X NV12 NV12 GPU2 NV12 NV12 X NV12 GPU3 NV12 NV12 NV12 X""", returncode=0) +@mock.patch("subprocess.run") +def test_all_nvlink_connected(mock_run, all_nvlink_connected_output): + mock_run.return_value = all_nvlink_connected_output + with mock.patch("builtins.print") as mock_print: + check_nvlink_connectivity() + mock_print.assert_any_call("All GPUs are fully connected via NVLink.") + + @pytest.fixture def nvlink_partially_connected_output(): + return mock.MagicMock(stdout=""" GPU0 GPU1 GPU2 GPU3 CPU Affinity +GPU0 X NV1 SYS SYS 0-7 +GPU1 NV1 X SYS SYS 0-7 +GPU2 SYS SYS X NV1 8-15 +GPU3 SYS SYS NV1 X 8-15 + +Legend: + X = Self + NV1 = Connected via NVLink with 1 hop + SYS = Connected via the PCIe or CPU subsystem""", returncode=0) + + +@mock.patch("subprocess.run") +def test_nvlink_partially_connected_output(mock_run, nvlink_partially_connected_output): + mock_run.return_value = nvlink_partially_connected_output + with mock.patch("builtins.print") as mock_print: + check_nvlink_connectivity() + mock_print.assert_any_call( + "Warning: Not all GPUs are fully connected via NVLink. Some GPUs are connected via slower interfaces. " + "It is recommended to switch to a different machine with faster GPU connections for optimal multi-GPU training performance." + ) + + +@pytest.fixture +def nvlink_not_connected_output(): + return mock.MagicMock(stdout=""" GPU0 GPU1 GPU2 GPU3 CPU Affinity NUMA Affinity GPU NUMA ID +GPU0 X PHB PHB PHB 0-47 0 N/A +GPU1 PHB X PHB PHB 0-47 0 N/A +GPU2 PHB PHB X PHB 0-47 0 N/A +GPU3 PHB PHB PHB X 0-47 0 N/A + +Legend: + + X = Self + SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) + NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node + PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) + PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) + PIX = Connection traversing at most a single PCIe bridge + NV# = Connection traversing a bonded set of # NVLinks""", returncode=0) + + +@mock.patch("subprocess.run") +def test_nvlink_not_connected_output(mock_run, nvlink_not_connected_output): + mock_run.return_value = nvlink_not_connected_output + with mock.patch("builtins.print") as mock_print: + check_nvlink_connectivity() + mock_print.assert_any_call( + "Warning: Not all GPUs are fully connected via NVLink. Some GPUs are connected via slower interfaces. " + "It is recommended to switch to a different machine with faster GPU connections for optimal multi-GPU training performance." + ) + + +@pytest.fixture +def nvlink_all_gpu_connected_but_other_connected_output(): return mock.MagicMock(stdout=""" GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 NIC0 NIC1 NIC2 NIC3 NIC4 NIC5 NIC6 NIC7 NIC8 NIC9 CPU Affinity NUMA Affinity GPU NUMA ID GPU0 X NV12 NV12 NV12 NV12 NV12 NV12 NV12 SYS SYS PXB PXB SYS SYS SYS SYS SYS SYS 0-63,128-191 0 N/A GPU1 NV12 X NV12 NV12 NV12 NV12 NV12 NV12 SYS SYS PXB PXB SYS SYS SYS SYS SYS SYS 0-63,128-191 0 N/A @@ -514,19 +577,8 @@ def nvlink_partially_connected_output(): @mock.patch("subprocess.run") -def test_all_nvlink_connected(mock_run, nvlink_connected_output): - mock_run.return_value = nvlink_connected_output +def test_nvlink_all_gpu_connected_but_other_connected_output(mock_run, nvlink_all_gpu_connected_but_other_connected_output): + mock_run.return_value = nvlink_all_gpu_connected_but_other_connected_output with mock.patch("builtins.print") as mock_print: check_nvlink_connectivity() mock_print.assert_any_call("All GPUs are fully connected via NVLink.") - - -@mock.patch("subprocess.run") -def test_not_all_nvlink_connected(mock_run, nvlink_partially_connected_output): - mock_run.return_value = nvlink_partially_connected_output - with mock.patch("builtins.print") as mock_print: - check_nvlink_connectivity() - mock_print.assert_any_call( - "Warning: Not all GPUs are fully connected via NVLink. Some GPUs are connected via slower interfaces. " - "It is recommended to switch to a different machine with faster GPU connections for optimal multi-GPU training performance." - )