feat: add env var SET_NUM_PROCESSES_TO_NUM_GPUS (foundation-model-stack#110)

* add env var SET_NUM_PROCESSES_TO_NUM_GPUS

Signed-off-by: Anh-Uong <anh.uong@ibm.com>

* fix num processes type

Signed-off-by: Anh-Uong <anh.uong@ibm.com>

* move num_processes logic into build.utils

Signed-off-by: Anh-Uong <anh.uong@ibm.com>

* add unit tests for num_processes

Signed-off-by: Anh-Uong <anh.uong@ibm.com>

* refactor duplicate test case

Signed-off-by: Anh-Uong <anh.uong@ibm.com>

---------

Signed-off-by: Anh-Uong <anh.uong@ibm.com>
Co-authored-by: Sukriti Sharma <Ssukriti@users.noreply.github.com>
2 people authored and jbusche committed Apr 9, 2024
1 parent 45624a3 commit 207fe3a
Showing 5 changed files with 82 additions and 18 deletions.
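In short: unless a job opts out by setting SET_NUM_PROCESSES_TO_NUM_GPUS=False, the launcher now derives accelerate's --num_processes from torch.cuda.device_count(), overriding (with a logged warning) any num_processes value in the job config. The diffs below add the variable to the image, implement the resolution in build/utils.py, and cover it with unit tests.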
1 change: 1 addition & 0 deletions build/Dockerfile
@@ -17,6 +17,7 @@ FROM registry.access.redhat.com/ubi9/ubi AS release
 ARG CUDA_VERSION=11.8.0
 ARG USER=tuning
 ARG USER_UID=1000
+ARG SET_NUM_PROCESSES_TO_NUM_GPUS=True
 
 USER root
 
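Note that ARG only provides a build-time default; presumably the value is re-exported as an ENV further down the Dockerfile (outside the visible hunk) so that it reaches the launcher at run time. A minimal sketch of the runtime read that mirrors this default:

# Sketch: how the launcher reads the variable at run time; the "True"
# fallback here matches the Dockerfile ARG above.
import os

set_to_num_gpus = os.getenv("SET_NUM_PROCESSES_TO_NUM_GPUS", "True").lower() == "true"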
2 changes: 2 additions & 0 deletions build/accelerate_launch.py
@@ -23,6 +23,8 @@
 
 # Third Party
 from accelerate.commands.launch import launch_command
 
+# Local
+from build.utils import process_accelerate_launch_args, get_job_config
 
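Only the import block of build/accelerate_launch.py appears in this hunk. Judging from the imports alone, the script presumably wires the helpers together roughly as follows (a sketch, not the file's actual body):

def main():
    # read the tuning job configuration (helper from build.utils)
    job_config = get_job_config()
    # translate it into an argparse namespace accepted by accelerate
    args = process_accelerate_launch_args(job_config)
    # hand off to accelerate's own launcher
    launch_command(args)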
26 changes: 25 additions & 1 deletion build/utils.py
@@ -20,6 +20,7 @@
 import pickle
 
 # Third Party
+import torch
 import transformers
 from accelerate.commands.launch import launch_command_parser
 
@@ -126,6 +127,10 @@ def process_accelerate_launch_args(job_config_dict):
     if accelerate_config:
         logging.info("Using accelerate_launch_args configs: %s", accelerate_config)
         for key, val in accelerate_config.items():
+            # skip num_processes here; it is assigned below based on SET_NUM_PROCESSES_TO_NUM_GPUS
+            if key == "num_processes":
+                continue
+
             if actions_type_map.get(key) == "_AppendAction":
                 for param_val in val:
                     accelerate_launch_args.extend([f"--{key}", str(param_val)])
@@ -139,8 +144,27 @@ def process_accelerate_launch_args(job_config_dict):
             if actions_type_map.get(key) == "_StoreAction":
                 accelerate_launch_args.append(str(val))
 
-    num_processes = accelerate_config.get("num_processes")
+    # accept SET_NUM_PROCESSES_TO_NUM_GPUS=True set in the shell, where it is read as a string
+    set_num_processes_to_num_gpus = os.getenv(
+        "SET_NUM_PROCESSES_TO_NUM_GPUS", "True"
+    ).lower()
+    user_arg_num_processes = accelerate_config.get("num_processes")
+    num_processes = 0
+    if set_num_processes_to_num_gpus == "true":
+        num_processes = torch.cuda.device_count()
+
+        if user_arg_num_processes:
+            logging.warning(
+                "SET_NUM_PROCESSES_TO_NUM_GPUS=True, overwriting user-set num_processes %s "
+                "with the number of GPUs available, %s.",
+                user_arg_num_processes,
+                num_processes,
+            )
+    elif user_arg_num_processes:
+        num_processes = int(user_arg_num_processes)
 
     if num_processes:
         accelerate_launch_args.extend(["--num_processes", str(num_processes)])
+        # if multi-GPU and the user did not pass an accelerate config_file,
+        # fall back to the default config for the default set of parameters
         if num_processes > 1 and not accelerate_config.get("config_file"):
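The precedence rule above, restated in isolation: the environment variable wins by default; the user's num_processes is honored only when the variable is explicitly "False"; and a result of 0 (e.g. no GPUs visible) suppresses the --num_processes flag entirely. A condensed, self-contained sketch of the same logic (resolve_num_processes is a name introduced here for illustration):

import os

import torch


def resolve_num_processes(user_num_processes=None):
    # default "True": size the job to every GPU torch can see
    if os.getenv("SET_NUM_PROCESSES_TO_NUM_GPUS", "True").lower() == "true":
        return torch.cuda.device_count()
    # opt-out path: honor the job config value, if any
    return int(user_num_processes) if user_num_processes else 0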
2 changes: 0 additions & 2 deletions tests/build/dummy_job_config.json
@@ -4,7 +4,6 @@
     "env": ["env1", "env2"],
     "dynamo_use_dynamic": true,
     "num_machines": 1,
-    "num_processes": 2,
     "main_process_port": 1234,
     "fsdp_backward_prefetch_policy": "TRANSFORMER_BASED_WRAP",
     "fsdp_sharding_strategy": 1,
@@ -13,7 +12,6 @@
     "fsdp_sync_module_states": true,
     "config_file": "fixtures/accelerate_fsdp_defaults.yaml"
   },
-  "multi_gpu": true,
   "model_name_or_path": "bigscience/bloom-560m",
   "training_data_path": "data/twitter_complaints_small.json",
   "output_dir": "bloom-twitter",
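With num_processes and multi_gpu removed from the fixture, the tests below control parallelism explicitly, patching torch.cuda.device_count and setting SET_NUM_PROCESSES_TO_NUM_GPUS, instead of inheriting a hard-coded process count from the config.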
69 changes: 54 additions & 15 deletions tests/build/test_utils.py
@@ -82,34 +82,73 @@ def test_process_launch_training_args_peft_method(job_config):
 
 
 def test_process_accelerate_launch_args(job_config):
     args = process_accelerate_launch_args(job_config)
     # json config values used
     assert args.use_fsdp is True
+    assert args.fsdp_backward_prefetch_policy == "TRANSFORMER_BASED_WRAP"
     assert args.env == ["env1", "env2"]
     assert args.training_script == "/app/launch_training.py"
+    assert args.config_file == "fixtures/accelerate_fsdp_defaults.yaml"
+
+    # default values
+    assert args.tpu_use_cluster is False
+    assert args.mixed_precision is None
 
 
+@patch("torch.cuda.device_count", return_value=1)
+def test_accelerate_launch_args_user_set_num_processes_ignored(mock_device_count, job_config):
+    job_config_copy = copy.deepcopy(job_config)
+    job_config_copy["accelerate_launch_args"]["num_processes"] = "3"
+    args = process_accelerate_launch_args(job_config_copy)
+    # number of processes is determined by the number of GPUs available
+    assert args.num_processes == 1
+
+    # if single-GPU, CUDA_VISIBLE_DEVICES is set
+    assert os.getenv("CUDA_VISIBLE_DEVICES") == "0"
+
+
+@patch.dict(os.environ, {"SET_NUM_PROCESSES_TO_NUM_GPUS": "False"})
+def test_accelerate_launch_args_user_set_num_processes(job_config):
+    job_config_copy = copy.deepcopy(job_config)
+    job_config_copy["accelerate_launch_args"]["num_processes"] = "3"
+
+    args = process_accelerate_launch_args(job_config_copy)
+    # json config values used
+    assert args.num_processes == 3
+    assert args.config_file == "fixtures/accelerate_fsdp_defaults.yaml"
+    assert args.use_fsdp is True
+    assert args.tpu_use_cluster is False
+
+
+def test_accelerate_launch_args_default_fsdp_config_multigpu(job_config):
+    with patch("torch.cuda.device_count", return_value=2):
+        with patch("os.path.exists", return_value=True):
+            job_config_copy = copy.deepcopy(job_config)
+            job_config_copy["accelerate_launch_args"].pop("config_file")
+
+            assert "config_file" not in job_config_copy["accelerate_launch_args"]
+
+            args = process_accelerate_launch_args(job_config_copy)
+
+            # the default config file is used
+            assert args.config_file == "/app/accelerate_fsdp_defaults.yaml"
+            # number of processes is determined by the number of GPUs available
+            assert args.num_processes == 2
+
+
 @patch("os.path.exists")
-def test_process_accelerate_launch_custom_fsdp(patch_path_exists):
+def test_process_accelerate_launch_custom_config_file(patch_path_exists):
     patch_path_exists.return_value = True
 
-    dummy_fsdp_path = "dummy_fsdp_config.yaml"
+    dummy_config_path = "dummy_fsdp_config.yaml"
 
     # When the user passes a custom FSDP config file, use the custom config;
     # accelerate launch will take `num_processes` from the config file
-    temp_job_config = {"accelerate_launch_args": {"config_file": dummy_fsdp_path}}
+    temp_job_config = {"accelerate_launch_args": {"config_file": dummy_config_path}}
     args = process_accelerate_launch_args(temp_job_config)
-    assert args.config_file == dummy_fsdp_path
-    assert args.num_processes == None
+    assert args.config_file == dummy_config_path
+    assert args.num_processes is None
 
-    # When user passes custom fsdp config file and also `num_processes` as a param, use custom config and
-    # overwrite num_processes from config with param
-    temp_job_config = {
-        "accelerate_launch_args": {
-            "config_file": dummy_fsdp_path,
-            "num_processes": 3,
-        }
-    }
+    temp_job_config = {"accelerate_launch_args": {"config_file": dummy_config_path}}
     args = process_accelerate_launch_args(temp_job_config)
-    assert args.config_file == dummy_fsdp_path
-    assert args.num_processes == 3
+    assert args.config_file == dummy_config_path
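One path the new tests leave uncovered is the warning logged when a user-supplied num_processes is overwritten. A sketch of how it could be asserted, assuming pytest's caplog fixture and this file's existing imports and job_config fixture (not part of this commit):

@patch("torch.cuda.device_count", return_value=2)
def test_num_processes_overwrite_warns(mock_device_count, job_config, caplog):
    job_config_copy = copy.deepcopy(job_config)
    job_config_copy["accelerate_launch_args"]["num_processes"] = "3"

    with caplog.at_level(logging.WARNING):
        args = process_accelerate_launch_args(job_config_copy)

    # the GPU count wins over the user's value, and the overwrite is logged
    assert args.num_processes == 2
    assert "SET_NUM_PROCESSES_TO_NUM_GPUS=True" in caplog.text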
