Skip to content

Commit

Permalink
Merge pull request easybuilders#19231 from Flamefire/20231114131104_new_pr_NCCL2103
Browse files Browse the repository at this point in the history

fix possible error/crash in NCCL on x86 due to cpuid
  • Loading branch information
smoors authored Nov 16, 2023
2 parents a2c174a + 11ad87f commit 4adb8d1
Show file tree
Hide file tree
Showing 10 changed files with 93 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@ toolchain = {'name': 'GCCcore', 'version': '10.3.0'}
github_account = 'NVIDIA'
source_urls = [GITHUB_SOURCE]
sources = ['v%(version)s-1.tar.gz']
checksums = ['55de166eb7dcab9ecef2629cdb5fb0c5ebec4fae03589c469ebe5dcb5716b3c5']
patches = ['NCCL-2.16.2_fix-cpuid.patch']
checksums = [
{'v2.10.3-1.tar.gz': '55de166eb7dcab9ecef2629cdb5fb0c5ebec4fae03589c469ebe5dcb5716b3c5'},
{'NCCL-2.16.2_fix-cpuid.patch': '0459ecadcd32b2a7a000a2ce4f675afba908b2c0afabafde585330ff4f83e277'},
]

builddependencies = [('binutils', '2.36.1')]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@ toolchain = {'name': 'GCCcore', 'version': '11.2.0'}
github_account = 'NVIDIA'
source_urls = [GITHUB_SOURCE]
sources = ['v%(version)s-1.tar.gz']
checksums = ['55de166eb7dcab9ecef2629cdb5fb0c5ebec4fae03589c469ebe5dcb5716b3c5']
patches = ['NCCL-2.16.2_fix-cpuid.patch']
checksums = [
{'v2.10.3-1.tar.gz': '55de166eb7dcab9ecef2629cdb5fb0c5ebec4fae03589c469ebe5dcb5716b3c5'},
{'NCCL-2.16.2_fix-cpuid.patch': '0459ecadcd32b2a7a000a2ce4f675afba908b2c0afabafde585330ff4f83e277'},
]

builddependencies = [('binutils', '2.37')]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@ toolchain = {'name': 'GCCcore', 'version': '11.2.0'}
github_account = 'NVIDIA'
source_urls = [GITHUB_SOURCE]
sources = ['v%(version)s-1.tar.gz']
checksums = ['55de166eb7dcab9ecef2629cdb5fb0c5ebec4fae03589c469ebe5dcb5716b3c5']
patches = ['NCCL-2.16.2_fix-cpuid.patch']
checksums = [
{'v2.10.3-1.tar.gz': '55de166eb7dcab9ecef2629cdb5fb0c5ebec4fae03589c469ebe5dcb5716b3c5'},
{'NCCL-2.16.2_fix-cpuid.patch': '0459ecadcd32b2a7a000a2ce4f675afba908b2c0afabafde585330ff4f83e277'},
]

builddependencies = [('binutils', '2.37')]

Expand Down
6 changes: 5 additions & 1 deletion easybuild/easyconfigs/n/NCCL/NCCL-2.11.4-gcccuda-2019b.eb
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ toolchain = {'name': 'gcccuda', 'version': '2019b'}
github_account = 'NVIDIA'
source_urls = [GITHUB_SOURCE]
sources = ['v%(version)s-1.tar.gz']
checksums = ['db4e9a0277a64f9a31ea9b5eea22e63f10faaed36dded4587bbc8a0d8eceed10']
patches = ['NCCL-2.16.2_fix-cpuid.patch']
checksums = [
{'v2.11.4-1.tar.gz': 'db4e9a0277a64f9a31ea9b5eea22e63f10faaed36dded4587bbc8a0d8eceed10'},
{'NCCL-2.16.2_fix-cpuid.patch': '0459ecadcd32b2a7a000a2ce4f675afba908b2c0afabafde585330ff4f83e277'},
]

moduleclass = 'lib'
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@ toolchain = {'name': 'GCCcore', 'version': '11.3.0'}
github_account = 'NVIDIA'
source_urls = [GITHUB_SOURCE]
sources = ['v%(version)s-1.tar.gz']
checksums = ['49b4fbfeebf1f62f6ceb69e72504045d8d1b4e7609e3c2477906f3004c7e2d82']
patches = ['NCCL-2.16.2_fix-cpuid.patch']
checksums = [
{'v2.12.12-1.tar.gz': '49b4fbfeebf1f62f6ceb69e72504045d8d1b4e7609e3c2477906f3004c7e2d82'},
{'NCCL-2.16.2_fix-cpuid.patch': '0459ecadcd32b2a7a000a2ce4f675afba908b2c0afabafde585330ff4f83e277'},
]

builddependencies = [('binutils', '2.38')]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@ toolchain = {'name': 'GCCcore', 'version': '12.2.0'}
github_account = 'NVIDIA'
source_urls = [GITHUB_SOURCE]
sources = ['v%(version)s-1.tar.gz']
checksums = ['7f7c738511a8876403fc574d13d48e7c250d934d755598d82e14bab12236fc64']
patches = ['NCCL-2.16.2_fix-cpuid.patch']
checksums = [
{'v2.16.2-1.tar.gz': '7f7c738511a8876403fc574d13d48e7c250d934d755598d82e14bab12236fc64'},
{'NCCL-2.16.2_fix-cpuid.patch': '0459ecadcd32b2a7a000a2ce4f675afba908b2c0afabafde585330ff4f83e277'},
]

builddependencies = [('binutils', '2.39')]

Expand Down
43 changes: 43 additions & 0 deletions easybuild/easyconfigs/n/NCCL/NCCL-2.16.2_fix-cpuid.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
The 2nd CPUID asm code modifies registers used by other variables due to failure to list
EBX, ECX & EDX in the "clobbers" list.
This causes corruption leading to segfaults or wrong results depending on compiler optimization/register allocation.

Fix by using the __cpuid GCC function.
See https://github.com/NVIDIA/nccl/pull/1070

Author: Alexander Grund (TU Dresden)

diff --git a/src/graph/xml.cc b/src/graph/xml.cc
index 316d20f..d0d1272 100644
--- a/src/graph/xml.cc
+++ b/src/graph/xml.cc
@@ -12,6 +12,9 @@
#include "core.h"
#include "nvmlwrap.h"
#include "xml.h"
+#if defined(__x86_64__)
+#include <cpuid.h>
+#endif

/*******************/
/* XML File Parser */
@@ -408,7 +411,8 @@ ncclResult_t ncclTopoGetXmlFromCpu(struct ncclXmlNode* cpuNode, struct ncclXml*
char vendor[12];
} cpuid0;

- asm volatile("cpuid" : "=b" (cpuid0.ebx), "=c" (cpuid0.ecx), "=d" (cpuid0.edx) : "a" (0) : "memory");
+ unsigned unused;
+ __cpuid(0, unused, cpuid0.ebx, cpuid0.ecx, cpuid0.edx);
char vendor[13];
strncpy(vendor, cpuid0.vendor, 12);
vendor[12] = '\0';
@@ -430,7 +434,8 @@ ncclResult_t ncclTopoGetXmlFromCpu(struct ncclXmlNode* cpuNode, struct ncclXml*
};
uint32_t val;
} cpuid1;
- asm volatile("cpuid" : "=a" (cpuid1.val) : "a" (1) : "memory");
+ unsigned unused;
+ __cpuid(1, cpuid1.val, unused, unused, unused);
int familyId = cpuid1.familyId + (cpuid1.extFamilyId << 4);
int modelId = cpuid1.modelId + (cpuid1.extModelId << 4);
NCCLCHECK(xmlSetAttrInt(cpuNode, "familyid", familyId));
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,12 @@ toolchain = {'name': 'GCCcore', 'version': '12.3.0'}
github_account = 'NVIDIA'
source_urls = [GITHUB_SOURCE]
sources = ['v%(version)s-1.tar.gz']
checksums = [('6477d83c9edbb34a0ebce6d751a1b32962bc6415d75d04972b676c6894ceaef9',
'b4f5d7d9eea2c12e32e7a06fe138b2cfc75969c6d5c473aa6f819a792db2fc96')]
patches = ['NCCL-2.16.2_fix-cpuid.patch']
checksums = [
('6477d83c9edbb34a0ebce6d751a1b32962bc6415d75d04972b676c6894ceaef9',
'b4f5d7d9eea2c12e32e7a06fe138b2cfc75969c6d5c473aa6f819a792db2fc96'),
{'NCCL-2.16.2_fix-cpuid.patch': '0459ecadcd32b2a7a000a2ce4f675afba908b2c0afabafde585330ff4f83e277'},
]

builddependencies = [('binutils', '2.40')]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,14 @@ dependencies = [('CUDAcore', local_cuda_version, '', SYSTEM)]
github_account = 'NVIDIA'
source_urls = [GITHUB_SOURCE]
sources = ['v%(version)s-1.tar.gz']
patches = ['NCCL-2.8.3_fix-isend-irecv.patch']
patches = [
'NCCL-2.8.3_fix-isend-irecv.patch',
'NCCL-2.16.2_fix-cpuid.patch',
]
checksums = [
'3ae89ddb2956fff081e406a94ff54ae5e52359f5d645ce977c7eba09b3b782e6', # v2.8.3-1.tar.gz
'04d61ea9b9f0954bed05494017649a68950b6b5e5851d969244f9ab67d5ecc92', # NCCL-2.8.3_fix-isend-irecv.patch
{'v2.8.3-1.tar.gz': '3ae89ddb2956fff081e406a94ff54ae5e52359f5d645ce977c7eba09b3b782e6'},
{'NCCL-2.8.3_fix-isend-irecv.patch': '04d61ea9b9f0954bed05494017649a68950b6b5e5851d969244f9ab67d5ecc92'},
{'NCCL-2.16.2_fix-cpuid.patch': '0459ecadcd32b2a7a000a2ce4f675afba908b2c0afabafde585330ff4f83e277'},
]

moduleclass = 'lib'
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,14 @@ dependencies = [('CUDAcore', local_cuda_version, '', SYSTEM)]
github_account = 'NVIDIA'
source_urls = [GITHUB_SOURCE]
sources = ['v%(version)s-1.tar.gz']
patches = ['NCCL-2.8.3_fix-isend-irecv.patch']
patches = [
'NCCL-2.8.3_fix-isend-irecv.patch',
'NCCL-2.16.2_fix-cpuid.patch',
]
checksums = [
'3ae89ddb2956fff081e406a94ff54ae5e52359f5d645ce977c7eba09b3b782e6', # v2.8.3-1.tar.gz
'04d61ea9b9f0954bed05494017649a68950b6b5e5851d969244f9ab67d5ecc92', # NCCL-2.8.3_fix-isend-irecv.patch
{'v2.8.3-1.tar.gz': '3ae89ddb2956fff081e406a94ff54ae5e52359f5d645ce977c7eba09b3b782e6'},
{'NCCL-2.8.3_fix-isend-irecv.patch': '04d61ea9b9f0954bed05494017649a68950b6b5e5851d969244f9ab67d5ecc92'},
{'NCCL-2.16.2_fix-cpuid.patch': '0459ecadcd32b2a7a000a2ce4f675afba908b2c0afabafde585330ff4f83e277'},
]

moduleclass = 'lib'

0 comments on commit 4adb8d1

Please sign in to comment.