Added a MultiCPU SLURM example using Accelerate Launch and MPIRun #2902

Merged · Jul 3, 2024 · 21 commits
2 changes: 2 additions & 0 deletions examples/README.md
@@ -233,6 +233,8 @@ In [/slurm/submit_multigpu.sh](./slurm/submit_multigpu.sh) the only parameter in

In [/slurm/submit_multinode.sh](./slurm/submit_multinode.sh) we must specify the number of nodes that will be part of the training (`--num_machines`), how many GPUs we will use in total (`--num_processes`), the [`backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend), `--main_process_ip`, which will be the address of the master node, and the `--main_process_port`.

In [/slurm/submit_multicpu.sh](./slurm/submit_multicpu.sh) we must specify the number of nodes that will be part of the training (`--num_machines`), how many CPU processes we will use in total (`--num_processes`), the [`backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend), `--main_process_ip`, which will be the address of the master node, and the `--main_process_port`. `--mpirun_hostfile` specifies the hostfile to use when running the job via `mpirun`.

In all of these scripts, we run `activateEnvironment.sh` at the beginning. This script should contain the necessary instructions to initialize the environment for execution. Below, we show an example that loads the necessary libraries ([Environment modules](https://github.com/cea-hpc/modules)), activates the Python environment, and sets up various environment variables, most of them to run the scripts in offline mode in case the cluster has no internet connection.

```bash
65 changes: 65 additions & 0 deletions examples/slurm/submit_multicpu.sh
@@ -0,0 +1,65 @@
#!/bin/bash -l

#SBATCH --job-name=multicpu
#SBATCH --nodes=2 # number of Nodes
#SBATCH --ntasks-per-node=1 # number of MP tasks
#SBATCH --exclusive
#SBATCH --output=O-%x.%j
#SBATCH --error=E-%x.%j

######################
### Set environment ###
######################
source activateEnvironment.sh

######################
#### Set network #####
######################
head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
######################

# Setup env variables for distributed jobs
export MASTER_PORT="${MASTER_PORT:-29555}"
echo "head_node_ip=${head_node_ip}"
echo "MASTER_PORT=${MASTER_PORT}"

INSTANCES_PER_NODE="${INSTANCES_PER_NODE:-1}"

if [[ $SLURM_NNODES == 1 ]] && [[ $INSTANCES_PER_NODE == 1 ]]; then
export CCL_WORKER_COUNT=0
LAUNCHER=""
else
# Setup env variables for distributed jobs
export CCL_WORKER_COUNT="${CCL_WORKER_COUNT:-2}"
echo "CCL_WORKER_COUNT=${CCL_WORKER_COUNT}"

# Write hostfile
HOSTFILE_PATH=hostfile
scontrol show hostname $SLURM_JOB_NODELIST | perl -ne 'chomp; print "$_\n" x 1' > ${HOSTFILE_PATH}

export LAUNCHER="accelerate launch \
--num_processes $((SLURM_NNODES * ${INSTANCES_PER_NODE})) \
--num_machines $SLURM_NNODES \
--rdzv_backend c10d \
--main_process_ip $head_node_ip \
--main_process_port $MASTER_PORT \
--mpirun_hostfile $HOSTFILE_PATH \
--mpirun_ccl $CCL_WORKER_COUNT"
fi

# Script to run and its arguments
export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}"
export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py"
export SCRIPT_ARGS=" \
--cpu \
--output_dir ${ACCELERATE_DIR}/examples/output \
"

# This step is necessary because accelerate launch does not handle multiline arguments properly
export CMD="$LAUNCHER $SCRIPT $SCRIPT_ARGS"
# Print the command
echo $CMD
echo ""

# Run the command
eval $CMD
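The branching above (skip the distributed launcher when there is a single instance on a single node, otherwise assemble the full `accelerate launch` prefix) can be sketched in Python; `build_launcher` is a hypothetical helper for illustration, not part of this PR:

```python
def build_launcher(nnodes, instances_per_node, head_ip, port, hostfile="hostfile"):
    # Mirrors the script: a single instance on a single node needs no
    # distributed launcher, so the command prefix is empty.
    if nnodes == 1 and instances_per_node == 1:
        return ""
    # Otherwise build the same accelerate launch prefix the script exports.
    return (
        "accelerate launch"
        f" --num_processes {nnodes * instances_per_node}"
        f" --num_machines {nnodes}"
        " --rdzv_backend c10d"
        f" --main_process_ip {head_ip}"
        f" --main_process_port {port}"
        f" --mpirun_hostfile {hostfile}"
    )
```

Because `CCL_WORKER_COUNT=0` disables oneCCL worker threads, the single-node path in the script sets it explicitly before dropping the launcher.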
7 changes: 4 additions & 3 deletions examples/slurm/submit_multigpu.sh
@@ -13,14 +13,15 @@
######################
### Set enviroment ###
######################
source activateEnviroment.sh
source activateEnvironment.sh
export GPUS_PER_NODE=4
######################

export SCRIPT=/accelerate/examples/complete_nlp_example.py
export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}"
export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py"
export SCRIPT_ARGS=" \
--mixed_precision fp16 \
--output_dir /accelerate/examples/output \
--output_dir ${ACCELERATE_DIR}/examples/output \
--with_tracking \
"

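The `${ACCELERATE_DIR:-/accelerate}` idiom introduced in these scripts falls back to a default when the variable is unset or empty. For readers less familiar with shell parameter expansion, a Python analogue (hypothetical helper):

```python
import os

def env_default(name, default):
    # Equivalent of the shell expansion ${NAME:-default}: the default is
    # used when the variable is unset *or* set to the empty string.
    value = os.environ.get(name)
    return value if value else default
```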
7 changes: 4 additions & 3 deletions examples/slurm/submit_multinode.sh
@@ -13,7 +13,7 @@
######################
### Set enviroment ###
######################
source activateEnviroment.sh
source activateEnvironment.sh
export GPUS_PER_NODE=4
######################

@@ -30,10 +30,11 @@ export LAUNCHER="accelerate launch \
--main_process_ip $head_node_ip \
--main_process_port 29500 \
"
export SCRIPT="/accelerate/examples/complete_nlp_example.py"
export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}"
export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py"
export SCRIPT_ARGS=" \
--mixed_precision fp16 \
--output_dir /accelerate/examples/output \
--output_dir ${ACCELERATE_DIR}/examples/output \
"

# This step is necessary because accelerate launch does not handle multiline arguments properly
20 changes: 14 additions & 6 deletions src/accelerate/utils/launch.py
@@ -67,10 +67,10 @@ def _get_mpirun_args():
mpirun_version = subprocess.check_output([mpi_app, "--version"])

if b"Open MPI" in mpirun_version:
return mpi_app, "--hostfile", "-n", "--npernode"
return mpi_app, "--hostfile", "-n", "--npernode", "--bind-to"
else:
# Intel MPI and MVAPICH both use the same arg names
return mpi_app, "-f", "-n", "-ppn"
return mpi_app, "-f", "-n", "-ppn", ""
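The flavor detection above reduces to a pure function over the `mpirun --version` banner. A simplified sketch (function name illustrative; the real `_get_mpirun_args` also locates the `mpirun` executable before querying it):

```python
def mpirun_flags(version_banner: bytes):
    # Open MPI spells the flags differently and supports --bind-to;
    # Intel MPI and MVAPICH share arg names and get no bind flag here,
    # hence the empty string as the fifth slot.
    if b"Open MPI" in version_banner:
        return ("--hostfile", "-n", "--npernode", "--bind-to")
    return ("-f", "-n", "-ppn", "")
```

Returning an empty string for the bind flag lets the caller test `if bind_to_arg:` and skip process binding on launchers that do not take a separate flag.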


def prepare_simple_launcher_cmd_env(args: argparse.Namespace) -> Tuple[List[str], Dict[str, str]]:
@@ -82,14 +82,23 @@ def prepare_simple_launcher_cmd_env(args: argparse.Namespace) -> Tuple[List[str]
raise ValueError("--module and --no_python cannot be used together")

if args.mpirun_hostfile is not None:
mpi_app_name, hostfile_arg, num_proc_arg, proc_per_node_arg = _get_mpirun_args()
mpi_app_name, hostfile_arg, num_proc_arg, proc_per_node_arg, bind_to_arg = _get_mpirun_args()
mpirun_ccl = getattr(args, "mpirun_ccl", None)
bind_to = getattr(args, "bind_to", "socket")
num_machines = args.num_machines
num_processes = getattr(args, "num_processes", None)
nproc_per_node = str(num_processes // num_machines) if num_processes and num_machines else "1"
cmd += [mpi_app_name, hostfile_arg, args.mpirun_hostfile, proc_per_node_arg, nproc_per_node]
cmd += [
mpi_app_name,
hostfile_arg,
args.mpirun_hostfile,
proc_per_node_arg,
nproc_per_node,
]
if num_processes:
cmd += [num_proc_arg, str(num_processes)]
if bind_to_arg:
cmd += [bind_to_arg, bind_to]
if not args.no_python:
cmd.append(sys.executable)
if args.module:
@@ -115,7 +124,7 @@ def prepare_simple_launcher_cmd_env(args: argparse.Namespace) -> Tuple[List[str]
current_env["MASTER_PORT"] = str(args.main_process_port)

if args.mpirun_hostfile is not None:
current_env["CCL_WORKER_COUNT"] = mpirun_ccl
current_env["CCL_WORKER_COUNT"] = str(mpirun_ccl)
elif args.num_processes > 1:
current_env["MASTER_ADDR"] = args.main_process_ip if args.main_process_ip is not None else "127.0.0.1"
current_env["MASTER_PORT"] = str(args.main_process_port) if args.main_process_port is not None else "29500"
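The hunk above derives the per-node process count from the totals and prepends the `mpirun` invocation to the command, exporting `CCL_WORKER_COUNT` as a string for the environment. A simplified sketch of that assembly (Intel-MPI-style flags assumed; function name illustrative):

```python
def mpirun_prefix(hostfile, num_processes=None, num_machines=None, mpirun_ccl="2"):
    # Processes per node: total processes divided evenly across machines,
    # defaulting to 1 when either value is missing.
    nproc_per_node = str(num_processes // num_machines) if num_processes and num_machines else "1"
    cmd = ["mpirun", "-f", hostfile, "-ppn", nproc_per_node]
    if num_processes:
        cmd += ["-n", str(num_processes)]
    # oneCCL worker count must be a string, since os.environ rejects ints.
    env = {"CCL_WORKER_COUNT": str(mpirun_ccl)}
    return cmd, env
```

The `str(mpirun_ccl)` coercion mirrors the `current_env["CCL_WORKER_COUNT"] = str(mpirun_ccl)` change in this diff: assigning a non-string into `os.environ` raises `TypeError`.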
@@ -250,7 +259,6 @@ def prepare_multi_gpu_env(args: argparse.Namespace) -> Dict[str, str]:
current_env["FSDP_USE_ORIG_PARAMS"] = str(args.fsdp_use_orig_params).lower()
current_env["FSDP_CPU_RAM_EFFICIENT_LOADING"] = str(args.fsdp_cpu_ram_efficient_loading).lower()
current_env["FSDP_SYNC_MODULE_STATES"] = str(args.fsdp_sync_module_states).lower()
current_env["FSDP_ACTIVATION_CHECKPOINTING"] = str(args.fsdp_activation_checkpointing).lower()

if args.use_megatron_lm:
prefix = "MEGATRON_LM_"