Add multinode support via Slurm trainer, large-scale race condition fix #63

Merged
merged 4 commits on Feb 22, 2024
Changes from 3 commits
51 changes: 51 additions & 0 deletions multinode_trainer.slurm
@@ -0,0 +1,51 @@
#!/bin/bash

#SBATCH --job-name=torchtrain_multi_node

#SBATCH --ntasks=2

#SBATCH --nodes=2

#SBATCH --gpus-per-task=8

#SBATCH --cpus-per-task=96

#SBATCH --partition=train


nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) )
nodes_array=($nodes)
head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)

echo Node IP: $head_node_ip
export LOGLEVEL=INFO
# Enable for A100
export FI_PROVIDER="efa"
# Ensure that P2P is available
# export NCCL_P2P_DISABLE=1
export NCCL_IB_DISABLE=1

# debugging flags (optional)
export NCCL_DEBUG=WARN
export PYTHONFAULTHANDLER=1
# optional debug settings
# export NCCL_DEBUG=INFO
# NCCL_DEBUG_SUBSYS=INIT,GRAPH,ENV

export LD_LIBRARY_PATH=/opt/amazon/efa/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
export CUDA_LAUNCH_BLOCKING=0

# on your cluster you might need these:
# set the network interface
export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond"
export NCCL_BUFFSIZE=2097152
#export TORCH_DIST_INIT_BARRIER=1
export FI_EFA_SET_CUDA_SYNC_MEMOPS=0

dcgmi profile --pause
# adjust sbatch --ntasks and sbatch --nodes above and --nnodes below
# to your specific node count, and update target launch file.
srun torchrun --nnodes 2 --nproc_per_node 8 --rdzv_id 101 --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:29500" ./train.py --steps 10
dcgmi profile --resume
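
A minimal submission sketch, assuming the script is saved as multinode_trainer.slurm and the default slurm-&lt;jobid&gt;.out output naming; adjust --ntasks/--nodes and the matching torchrun --nnodes together when changing the node count:

```bash
# Hypothetical usage sketch, not part of this PR.
sbatch multinode_trainer.slurm     # queue the 2-node, 16-GPU job on the "train" partition
squeue -u "$USER"                  # confirm both nodes were allocated
tail -f slurm-<jobid>.out          # follow the torchrun / training output
```
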
7 changes: 4 additions & 3 deletions torchtrain/profiling.py
@@ -1,8 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

import contextlib
import os

import torch

try:
@@ -47,15 +48,15 @@ def trace_handler(prof):
curr_trace_dir_name = "iteration_" + str(_global_iter_count)
curr_trace_dir = os.path.join(trace_dir, curr_trace_dir_name)
if not os.path.exists(curr_trace_dir):
-    os.makedirs(curr_trace_dir)
+    os.makedirs(curr_trace_dir, exist_ok=True)
rank0_log(f"exporting profile traces to {curr_trace_dir}")

prof.export_chrome_trace(f"{curr_trace_dir}/rank{rank}_trace.json")

rank0_log(f"Profiling active. Traces will be saved at {trace_dir}")

if not os.path.exists(trace_dir):
-    os.makedirs(trace_dir)
+    os.makedirs(trace_dir, exist_ok=True)

with torch.profiler.profile(
activities=[
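
The large-scale race condition fix is the exist_ok=True change above: with many ranks sharing a filesystem, several processes can pass the os.path.exists check before any of them has created the directory, and the slower ones then fail with FileExistsError. A minimal sketch of the before/after pattern (the helper name is illustrative, not from this PR):

```python
import os

def ensure_dir(path: str) -> None:
    # Pre-fix pattern, racy when many ranks run it concurrently:
    #   if not os.path.exists(path):
    #       os.makedirs(path)   # a second rank reaching this line raises FileExistsError
    # Post-fix pattern: idempotent, safe regardless of which rank creates the directory first.
    os.makedirs(path, exist_ok=True)
```
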