diff --git a/CHANGELOG.md b/CHANGELOG.md index 304b907cca0..4d068bd23a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ All notable changes to this project will be documented in this file. ### New features -- +- Action task supports multi GPU training. () ### Enhancements diff --git a/otx/algorithms/action/adapters/mmaction/task.py b/otx/algorithms/action/adapters/mmaction/task.py index a4815b99113..c889224545c 100644 --- a/otx/algorithms/action/adapters/mmaction/task.py +++ b/otx/algorithms/action/adapters/mmaction/task.py @@ -158,9 +158,10 @@ def configure( recipe_cfg.work_dir = self._output_path recipe_cfg.resume = self._resume - recipe_cfg.distributed = False recipe_cfg.omnisource = False + self._configure_device(recipe_cfg, training) + if data_cfg is not None: recipe_cfg.merge_from_dict(data_cfg) @@ -196,6 +197,40 @@ def configure( self._config = recipe_cfg return recipe_cfg + def _configure_device(self, cfg: Config, training: bool): + """Setting device for training and inference.""" + cfg.distributed = False + if torch.distributed.is_initialized(): + cfg.gpu_ids = [int(os.environ["LOCAL_RANK"])] + if training: # TODO multi GPU is available only in training. Evaluation needs to be supported later. + cfg.distributed = True + self.configure_distributed(cfg) + elif "gpu_ids" not in cfg: + gpu_ids = os.environ.get("CUDA_VISIBLE_DEVICES") + logger.info(f"CUDA_VISIBLE_DEVICES = {gpu_ids}") + if gpu_ids is not None: + cfg.gpu_ids = range(len(gpu_ids.split(","))) + else: + cfg.gpu_ids = range(1) + + # consider "cuda" and "cpu" device only + if not torch.cuda.is_available(): + cfg.device = "cpu" + cfg.gpu_ids = range(-1, 0) + else: + cfg.device = "cuda" + + @staticmethod + def configure_distributed(cfg: Config): + """Patching for distributed training.""" + if hasattr(cfg, "dist_params") and cfg.dist_params.get("linear_scale_lr", False): + new_lr = len(cfg.gpu_ids) * cfg.optimizer.lr + logger.info( + f"enabled linear scaling rule to the learning rate. \ + changed LR from {cfg.optimizer.lr} to {new_lr}" + ) + cfg.optimizer.lr = new_lr + # pylint: disable=too-many-branches, too-many-statements def _train_model( self, diff --git a/otx/algorithms/action/task.py b/otx/algorithms/action/task.py index 6ec491dbfc4..f39c6af1ba7 100644 --- a/otx/algorithms/action/task.py +++ b/otx/algorithms/action/task.py @@ -67,6 +67,7 @@ from otx.api.usecases.evaluation.metrics_helper import MetricsHelper from otx.api.usecases.tasks.interfaces.export_interface import ExportType from otx.api.utils.vis_utils import get_actmap +from otx.cli.utils.multi_gpu import is_multigpu_child_process logger = get_logger() @@ -430,6 +431,9 @@ def _generate_training_metrics(learning_curves, scores, metric_name="mAP") -> It def save_model(self, output_model: ModelEntity): """Save best model weights in ActionTrainTask.""" + if is_multigpu_child_process(): + return + logger.info("called save_model") buffer = io.BytesIO() hyperparams_str = ids_to_strings(cfg_helper.convert(self._hyperparams, dict, enum_to_str=True)) diff --git a/otx/algorithms/common/tasks/base_task.py b/otx/algorithms/common/tasks/base_task.py index ed16b416fa3..d0051fc9dd0 100644 --- a/otx/algorithms/common/tasks/base_task.py +++ b/otx/algorithms/common/tasks/base_task.py @@ -15,6 +15,7 @@ # and limitations under the License. import io +import logging import os import shutil import tempfile @@ -136,7 +137,10 @@ def _setup_multigpu_training(): if not dist.is_initialized(): torch.cuda.set_device(int(os.environ["LOCAL_RANK"])) dist.init_process_group(backend="nccl", init_method="env://", timeout=timedelta(seconds=30)) - logger.info(f"Dist info: rank {dist.get_rank()} / {dist.get_world_size()} world_size") + rank = dist.get_rank() + logger.info(f"Dist info: rank {rank} / {dist.get_world_size()} world_size") + if rank != 0: + logging.disable(logging.WARNING) def _get_tmp_dir(self): self._work_dir_is_temp = True diff --git a/otx/cli/tools/train.py b/otx/cli/tools/train.py index 7acbab7c58d..3bbe602521e 100644 --- a/otx/cli/tools/train.py +++ b/otx/cli/tools/train.py @@ -24,7 +24,6 @@ from otx.api.entities.inference_parameters import InferenceParameters from otx.api.entities.model import ModelEntity -from otx.api.entities.model_template import TaskType from otx.api.entities.resultset import ResultSetEntity from otx.api.entities.subset import Subset from otx.api.entities.task_environment import TaskEnvironment @@ -229,9 +228,7 @@ def train(exit_stack: Optional[ExitStack] = None): # pylint: disable=too-many-b if args.gpus: multigpu_manager = MultiGPUManager(train, args.gpus, args.rdzv_endpoint, args.base_rank, args.world_size) - if template.task_type in (TaskType.ACTION_CLASSIFICATION, TaskType.ACTION_DETECTION): - print("Multi-GPU training for action tasks isn't supported yet. A single GPU will be used for a training.") - elif ( + if ( multigpu_manager.is_available() and not template.task_type.is_anomaly # anomaly tasks don't use this way for multi-GPU training ): diff --git a/tests/e2e/cli/action/test_action_classification.py b/tests/e2e/cli/action/test_action_classification.py index 7b21045020e..40796ee9253 100644 --- a/tests/e2e/cli/action/test_action_classification.py +++ b/tests/e2e/cli/action/test_action_classification.py @@ -3,9 +3,11 @@ # SPDX-License-Identifier: Apache-2.0 # +import copy import os import pytest +import torch from otx.api.entities.model_template import parse_model_template from otx.cli.registry import Registry @@ -30,6 +32,7 @@ otx_dir = os.getcwd() +MULTI_GPU_UNAVAILABLE = torch.cuda.device_count() <= 1 TT_STABILITY_TESTS = os.environ.get("TT_STABILITY_TESTS", False) if TT_STABILITY_TESTS: default_template = parse_model_template( @@ -87,3 +90,13 @@ def test_pot_eval(self, template, tmp_dir_path): pytest.skip(reason="[CVS-106939] MoViNet fails with POT") tmp_dir_path = tmp_dir_path / "action_cls" pot_eval_testing(template, tmp_dir_path, otx_dir, args) + + @e2e_pytest_component + @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS") + @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient") + @pytest.mark.parametrize("template", templates, ids=templates_ids) + def test_otx_multi_gpu_train(self, template, tmp_dir_path): + tmp_dir_path = tmp_dir_path / "action_cls/test_multi_gpu" + args1 = copy.deepcopy(args) + args1["--gpus"] = "0,1" + otx_train_testing(template, tmp_dir_path, otx_dir, args1) diff --git a/tests/e2e/cli/action/test_action_detection.py b/tests/e2e/cli/action/test_action_detection.py index cb156fc5dc8..2ce73eb80b7 100644 --- a/tests/e2e/cli/action/test_action_detection.py +++ b/tests/e2e/cli/action/test_action_detection.py @@ -3,9 +3,11 @@ # SPDX-License-Identifier: Apache-2.0 # +import copy import os import pytest +import torch from otx.api.entities.model_template import parse_model_template from otx.cli.registry import Registry @@ -30,6 +32,7 @@ otx_dir = os.getcwd() +MULTI_GPU_UNAVAILABLE = torch.cuda.device_count() <= 1 TT_STABILITY_TESTS = os.environ.get("TT_STABILITY_TESTS", False) if TT_STABILITY_TESTS: default_template = parse_model_template( @@ -86,3 +89,13 @@ def test_pot_optimize(self, template, tmp_dir_path): def test_pot_eval(self, template, tmp_dir_path): tmp_dir_path = tmp_dir_path / "action_det" pot_eval_testing(template, tmp_dir_path, otx_dir, args) + + @e2e_pytest_component + @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS") + @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient") + @pytest.mark.parametrize("template", templates, ids=templates_ids) + def test_otx_multi_gpu_train(self, template, tmp_dir_path): + tmp_dir_path = tmp_dir_path / "action_det/test_multi_gpu" + args1 = copy.deepcopy(args) + args1["--gpus"] = "0,1" + otx_train_testing(template, tmp_dir_path, otx_dir, args1) diff --git a/tests/integration/cli/action/test_action_classification.py b/tests/integration/cli/action/test_action_classification.py index 550c3d79100..b102e2b2efa 100644 --- a/tests/integration/cli/action/test_action_classification.py +++ b/tests/integration/cli/action/test_action_classification.py @@ -7,6 +7,7 @@ from copy import deepcopy import pytest +import torch from otx.api.entities.model_template import parse_model_template from otx.cli.registry import Registry @@ -29,6 +30,7 @@ otx_dir = os.getcwd() +MULTI_GPU_UNAVAILABLE = torch.cuda.device_count() <= 1 TT_STABILITY_TESTS = os.environ.get("TT_STABILITY_TESTS", False) if TT_STABILITY_TESTS: default_template = parse_model_template( @@ -77,3 +79,13 @@ def test_otx_train_auto_decrease_batch_size(self, template, tmp_dir_path): decrease_bs_args["train_params"].extend(["--learning_parameters.auto_decrease_batch_size", "true"]) tmp_dir_path = tmp_dir_path / "action_cls_auto_decrease_batch_size" otx_train_testing(template, tmp_dir_path, otx_dir, decrease_bs_args) + + @e2e_pytest_component + @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient") + @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS") + @pytest.mark.parametrize("template", templates, ids=templates_ids) + def test_otx_multi_gpu_train(self, template, tmp_dir_path): + tmp_dir_path = tmp_dir_path / "action_cls/test_multi_gpu" + args1 = deepcopy(args) + args1["--gpus"] = "0,1" + otx_train_testing(template, tmp_dir_path, otx_dir, args1)