
Commit

Merge pull request #632 from gordicaleksa/multi_node_my
MPI/TCP/FS for NCCL-init
karpathy authored Jun 24, 2024
2 parents 2a4be7f + 4af6a6a commit 69b50ad
Showing 9 changed files with 683 additions and 72 deletions.
Makefile (32 changes: 21 additions & 11 deletions)
@@ -188,27 +188,37 @@ else
   endif
 endif

-# Check if OpenMPI and NCCL are available, include them if so, for multi-GPU training
+# Check if NCCL is available, include if so, for multi-GPU training
 ifeq ($(NO_MULTI_GPU), 1)
-  $(info → Multi-GPU (OpenMPI + NCCL) is manually disabled)
+  $(info → Multi-GPU (NCCL) is manually disabled)
 else
   ifneq ($(OS), Windows_NT)
     # Detect if running on macOS or Linux
     ifeq ($(SHELL_UNAME), Darwin)
-      $(info ✗ Multi-GPU on CUDA on Darwin is not supported, skipping OpenMPI + NCCL support)
-    else ifeq ($(shell [ -d /usr/lib/x86_64-linux-gnu/openmpi/lib/ ] && [ -d /usr/lib/x86_64-linux-gnu/openmpi/include/ ] && echo "exists"), exists)
-      $(info ✓ OpenMPI found, OK to train with multiple GPUs)
-      NVCC_INCLUDES += -I/usr/lib/x86_64-linux-gnu/openmpi/include
-      NVCC_LDFLAGS += -L/usr/lib/x86_64-linux-gnu/openmpi/lib/
-      NVCC_LDLIBS += -lmpi -lnccl
-      NVCC_FLAGS += -DMULTI_GPU
+      $(info ✗ Multi-GPU on CUDA on Darwin is not supported, skipping NCCL support)
+    else ifeq ($(shell dpkg -l | grep -q nccl && echo "exists"), exists)
+      $(info ✓ NCCL found, OK to train with multiple GPUs)
+      NVCC_LDLIBS += -lnccl
     else
-      $(info ✗ OpenMPI is not found, disabling multi-GPU support)
-      $(info ---> On Linux you can try install OpenMPI with `sudo apt install openmpi-bin openmpi-doc libopenmpi-dev`)
+      $(info ✗ NCCL is not found, disabling multi-GPU support)
+      $(info ---> On Linux you can try install NCCL with `sudo apt install libnccl2 libnccl-dev`)
     endif
   endif
 endif

+ifeq ($(NO_USE_MPI), 1)
+  $(info → MPI is manually disabled)
+else ifeq ($(shell [ -d /usr/lib/x86_64-linux-gnu/openmpi/lib/ ] && [ -d /usr/lib/x86_64-linux-gnu/openmpi/include/ ] && echo "exists"), exists)
+  $(info ✓ MPI enabled)
+  NVCC_INCLUDES += -I/usr/lib/x86_64-linux-gnu/openmpi/include
+  NVCC_LDFLAGS += -L/usr/lib/x86_64-linux-gnu/openmpi/lib/
+  NVCC_LDLIBS += -lmpi
+  NVCC_FLAGS += -DUSE_MPI
+  NVCC_FLAGS += -DMULTI_GPU
+else
+  $(info ✗ MPI not found)
+endif
+
 # Precision settings, default to bf16 but ability to override
 PRECISION ?= BF16
 VALID_PRECISIONS := FP32 FP16 BF16
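The reorganized block above splits detection in two: NCCL (checked via `dpkg`) now pulls in `-lnccl`, while OpenMPI is probed separately and, when present, adds `-lmpi`, `-DUSE_MPI` and `-DMULTI_GPU`. As a hedged sketch (not part of the diff), this is how the `NO_MULTI_GPU` and `NO_USE_MPI` overrides defined above might be passed on the `make` command line, using the `train_gpt2cu` target referenced in the README:

```bash
# Default: let the Makefile probe for NCCL and OpenMPI as shown above.
make train_gpt2cu

# Skip the MPI block even if OpenMPI is installed under /usr/lib/x86_64-linux-gnu/openmpi.
make train_gpt2cu NO_USE_MPI=1

# Disable the multi-GPU (NCCL) block entirely.
make train_gpt2cu NO_MULTI_GPU=1

# Reproduce the Makefile's probes by hand if detection misbehaves.
dpkg -l | grep nccl                            # the NCCL check
ls /usr/lib/x86_64-linux-gnu/openmpi/include   # the OpenMPI path the Makefile expects
```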
README.md (21 changes: 20 additions & 1 deletion)
@@ -134,7 +134,9 @@ sudo apt-get -y install libcudnn9-dev-cuda-12

 On top of this you need the [cuDNN frontend](https://github.com/NVIDIA/cudnn-frontend/tree/main), but this is just header files. Simply clone the repo to your disk. The Makefile currently looks for it in either your home directory or the current directory. If you have put it elsewhere, add `CUDNN_FRONTEND_PATH=/path/to/your/cudnn-frontend/include` to the `make` command-line.

-**multi-GPU training using MPI and NCCL**. Make sure you install MPI and NCCL, e.g. on Linux:
+## multi-GPU training
+
+Make sure you install MPI and NCCL, e.g. on Linux:

 ```bash
 sudo apt install openmpi-bin openmpi-doc libopenmpi-dev
@@ -149,6 +151,23 @@ make train_gpt2cu
 mpirun -np <number of GPUs> ./train_gpt2cu
 ```

+Or simply run one of our scripts under `./scripts/`.
+
+## multi-node training
+
+Make sure you've installed `NCCL` following the instructions from the [multi-GPU](#multi-gpu-training) section.
+
+There are currently three supported ways to run multi-node training:
+1) Use OpenMPI to exchange the NCCL unique id and initialize NCCL. See e.g. the `./scripts/multi_node/run_gpt2_124M_mpi.sh` script for details.
+2) Use a shared file system to init NCCL. See the `./scripts/multi_node/run_gpt2_124M_fs.sbatch` script for details.
+3) Use TCP sockets to init NCCL. See the `./scripts/multi_node/run_gpt2_124M_tcp.sbatch` script for details.
+
+Note:
+* If you're running in a Slurm environment and your Slurm doesn't support PMIx (which we assume will be a common situation given that `slurm-wlm` dropped PMIx support), you will have to use the FS (2) or TCP (3) approach. To test whether your Slurm supports PMIx, run `srun --mpi=list` and see whether you get `pmix` in the output.
+* If you don't have Slurm set up, you can kick off a multi-node run using `mpirun` - the MPI (1) approach.
+
+None of these three methods is superior; we simply offer options so that you can run in your specific environment.
+
 ## experiments / sweeps
 
 Just as an example process to sweep learning rates on a machine with 4 GPUs on TinyStories. Run a shell script `sweep.sh` (after you of course `chmod u+x sweep.sh`):
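To make the three init paths above concrete, here is a hedged launch sketch (not part of the diff). The script paths are the ones referenced in the README; the host file and GPU count are placeholder assumptions about a generic cluster, and the `.sbatch` files are assumed to be submitted through Slurm's `sbatch` as usual:

```bash
# 1) MPI: mpirun spawns the ranks, which exchange the NCCL unique id over MPI.
#    hosts.txt is an assumed, user-provided list of nodes.
mpirun -np <number of GPUs> --hostfile hosts.txt ./train_gpt2cu
# See ./scripts/multi_node/run_gpt2_124M_mpi.sh for the project's actual invocation.

# 2) Shared file system: the NCCL id is passed through a file visible to every node.
sbatch ./scripts/multi_node/run_gpt2_124M_fs.sbatch

# 3) TCP sockets: the NCCL id is sent from rank 0 over a socket.
sbatch ./scripts/multi_node/run_gpt2_124M_tcp.sbatch

# Check whether your Slurm build supports PMIx (needed to drive the MPI path via srun).
srun --mpi=list   # look for "pmix" in the output
```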
(Diffs for the remaining 7 changed files are not shown here.)

0 comments on commit 69b50ad
