-
Notifications
You must be signed in to change notification settings - Fork 953
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added a MultiCPU SLURM example using Accelerate Launch and MPIRun (#2902)
* initial commit for slurm multicpu script * changed output path * Added multicpu example using accelerate + mpirun + slurm * removed file * rename file * deleted file * refactored for cleanliness * updated docs * fixed variable names * quality update * test fix * addressed review comments * fix typo for activateEnvironment.sh * added ACCELERATE path * Edit wording Co-authored-by: Dina Suehiro Jones <dina.s.jones@intel.com> * added back mistakenly deleted line --------- Co-authored-by: Dina Suehiro Jones <dina.s.jones@intel.com>
- Loading branch information
1 parent
57a4c74
commit 6882ff2
Showing
5 changed files
with
89 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
#!/bin/bash -l | ||
|
||
#SBATCH --job-name=multicpu | ||
#SBATCH --nodes=2 # number of Nodes | ||
#SBATCH --ntasks-per-node=1 # number of MP tasks | ||
#SBATCH --exclusive | ||
#SBATCH --output=O-%x.%j | ||
#SBATCH --error=E-%x.%j | ||
|
||
###################### | ||
### Set enviroment ### | ||
###################### | ||
source activateEnvironment.sh | ||
|
||
###################### | ||
#### Set network ##### | ||
###################### | ||
head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) | ||
###################### | ||
|
||
# Setup env variables for distributed jobs | ||
export MASTER_PORT="${MASTER_PORT:-29555 }" | ||
echo "head_node_ip=${head_node_ip}" | ||
echo "MASTER_PORT=${MASTER_PORT}" | ||
|
||
INSTANCES_PER_NODE="${INSTANCES_PER_NODE:-1}" | ||
|
||
if [[ $SLURM_NNODES == 1 ]] && [[ $INSTANCES_PER_NODE == 1 ]]; then | ||
export CCL_WORKER_COUNT=0 | ||
LAUNCHER="" | ||
else | ||
# Setup env variables for distributed jobs | ||
export CCL_WORKER_COUNT="${CCL_WORKER_COUNT:-2}" | ||
echo "CCL_WORKER_COUNT=${CCL_WORKER_COUNT}" | ||
|
||
# Write hostfile | ||
HOSTFILE_PATH=hostfile | ||
scontrol show hostname $SLURM_JOB_NODELIST | perl -ne 'chomb; print "$_"x1'> ${HOSTFILE_PATH} | ||
|
||
export LAUNCHER="accelerate launch \ | ||
--num_processes $((SLURM_NNODES * ${INSTANCES_PER_NODE})) \ | ||
--num_machines $SLURM_NNODES \ | ||
--rdzv_backend c10d \ | ||
--main_process_ip $head_node_ip \ | ||
--main_process_port $MASTER_PORT \ | ||
--mpirun_hostfile $HOSTFILE_PATH \ | ||
--mpirun_ccl $CCL_WORKER_COUNT" | ||
fi | ||
|
||
# This step is necessary because accelerate launch does not handle multiline arguments properly | ||
export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}" | ||
export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py" | ||
export SCRIPT_ARGS=" \ | ||
--cpu \ | ||
--output_dir ${ACCELERATE_DIR}/examples/output \ | ||
" | ||
|
||
# This step is necessary because accelerate launch does not handle multiline arguments properly | ||
export CMD="$LAUNCHER $SCRIPT $SCRIPT_ARGS" | ||
# Print the command | ||
echo $CMD | ||
echo "" | ||
|
||
# Run the command | ||
eval $CMD |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters