From 6acfd3bb51d36d7ba352e0615a3d7d40501f618f Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Wed, 16 Oct 2024 18:37:09 +0800 Subject: [PATCH 1/3] feat(pt): support CPU parallel training with PT --- deepmd/pt/entrypoints/main.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py index a0694c41c5..6db59c6c27 100644 --- a/deepmd/pt/entrypoints/main.py +++ b/deepmd/pt/entrypoints/main.py @@ -105,8 +105,16 @@ def get_trainer( local_rank = os.environ.get("LOCAL_RANK") if local_rank is not None: local_rank = int(local_rank) - assert dist.is_nccl_available() - dist.init_process_group(backend="nccl") + nccl_available = dist.is_nccl_available() + gloo_available = dist.is_gloo_available() + # nccl first + if nccl_available: + backend = "nccl" + elif gloo_available: + backend = "gloo" + else: + raise RuntimeError("No suitable backend found. Neither NCCL nor Gloo is available.") + dist.init_process_group(backend=backend) def prepare_trainer_input_single( model_params_single, data_dict_single, rank=0, seed=None From 0b0d943961c975030ac2e0b566a2ffd6afc6951a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 16 Oct 2024 10:38:04 +0000 Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/pt/entrypoints/main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py index 6db59c6c27..8b558040ee 100644 --- a/deepmd/pt/entrypoints/main.py +++ b/deepmd/pt/entrypoints/main.py @@ -113,7 +113,9 @@ def get_trainer( elif gloo_available: backend = "gloo" else: - raise RuntimeError("No suitable backend found. Neither NCCL nor Gloo is available.") + raise RuntimeError( + "No suitable backend found. Neither NCCL nor Gloo is available." + ) dist.init_process_group(backend=backend) def prepare_trainer_input_single( From f9b43e8fb630d0cae240ccd964bf381fbf5f096e Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Wed, 23 Oct 2024 12:47:23 +0800 Subject: [PATCH 3/3] Update main.py --- deepmd/pt/entrypoints/main.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py index 8b558040ee..47946d3037 100644 --- a/deepmd/pt/entrypoints/main.py +++ b/deepmd/pt/entrypoints/main.py @@ -105,18 +105,7 @@ def get_trainer( local_rank = os.environ.get("LOCAL_RANK") if local_rank is not None: local_rank = int(local_rank) - nccl_available = dist.is_nccl_available() - gloo_available = dist.is_gloo_available() - # nccl first - if nccl_available: - backend = "nccl" - elif gloo_available: - backend = "gloo" - else: - raise RuntimeError( - "No suitable backend found. Neither NCCL nor Gloo is available." - ) - dist.init_process_group(backend=backend) + dist.init_process_group(backend="cuda:nccl,cpu:gloo") def prepare_trainer_input_single( model_params_single, data_dict_single, rank=0, seed=None