diff --git a/CHANGELOG.md b/CHANGELOG.md
index 304b907cca0..4d068bd23a8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,7 +6,7 @@ All notable changes to this project will be documented in this file.
 
 ### New features
 
--
+- Action tasks support multi-GPU training. (<https://github.com/openvinotoolkit/training_extensions/pull/2057>)
 
 ### Enhancements
 
diff --git a/otx/algorithms/action/adapters/mmaction/task.py b/otx/algorithms/action/adapters/mmaction/task.py
index a4815b99113..c889224545c 100644
--- a/otx/algorithms/action/adapters/mmaction/task.py
+++ b/otx/algorithms/action/adapters/mmaction/task.py
@@ -158,9 +158,10 @@ def configure(
 
         recipe_cfg.work_dir = self._output_path
         recipe_cfg.resume = self._resume
-        recipe_cfg.distributed = False
         recipe_cfg.omnisource = False
 
+        self._configure_device(recipe_cfg, training)
+
         if data_cfg is not None:
             recipe_cfg.merge_from_dict(data_cfg)
 
@@ -196,6 +197,40 @@ def configure(
         self._config = recipe_cfg
         return recipe_cfg
 
+    def _configure_device(self, cfg: Config, training: bool):
+        """Set cfg.distributed, cfg.gpu_ids and cfg.device for training or inference."""
+        cfg.distributed = False
+        if torch.distributed.is_initialized():
+            # a process group exists: this process uses only the GPU index taken from LOCAL_RANK
+            cfg.gpu_ids = [int(os.environ["LOCAL_RANK"])]
+            if training:  # TODO: distributed mode is enabled for training only; evaluation comes later.
+                cfg.distributed = True
+                self.configure_distributed(cfg)
+        elif "gpu_ids" not in cfg:
+            gpu_ids = os.environ.get("CUDA_VISIBLE_DEVICES")
+            logger.info(f"CUDA_VISIBLE_DEVICES = {gpu_ids}")
+            num_gpus = 1 if gpu_ids is None else len(gpu_ids.split(","))
+            cfg.gpu_ids = range(num_gpus)
+
+        # only the "cuda" and "cpu" devices are considered
+        if torch.cuda.is_available():
+            cfg.device = "cuda"
+        else:
+            # no CUDA: run on CPU and overwrite whatever gpu_ids was chosen above
+            cfg.device = "cpu"
+            cfg.gpu_ids = range(-1, 0)
+
+    @staticmethod
+    def configure_distributed(cfg: Config):
+        """Scale cfg.optimizer.lr by the world size when cfg.dist_params.linear_scale_lr is set."""
+        if hasattr(cfg, "dist_params") and cfg.dist_params.get("linear_scale_lr", False):
+            # _configure_device sets cfg.gpu_ids to one local rank, so its length is not the process count
+            new_lr = torch.distributed.get_world_size() * cfg.optimizer.lr
+            logger.info(
+                f"enabled linear scaling rule to the learning rate. changed LR from {cfg.optimizer.lr} to {new_lr}"
+            )
+            cfg.optimizer.lr = new_lr
+
     # pylint: disable=too-many-branches, too-many-statements
     def _train_model(
         self,
diff --git a/otx/algorithms/action/task.py b/otx/algorithms/action/task.py
index 6ec491dbfc4..f39c6af1ba7 100644
--- a/otx/algorithms/action/task.py
+++ b/otx/algorithms/action/task.py
@@ -67,6 +67,7 @@
 from otx.api.usecases.evaluation.metrics_helper import MetricsHelper
 from otx.api.usecases.tasks.interfaces.export_interface import ExportType
 from otx.api.utils.vis_utils import get_actmap
+from otx.cli.utils.multi_gpu import is_multigpu_child_process
 
 logger = get_logger()
 
@@ -430,6 +431,9 @@ def _generate_training_metrics(learning_curves, scores, metric_name="mAP") -> It
 
     def save_model(self, output_model: ModelEntity):
         """Save best model weights in ActionTrainTask."""
+        if is_multigpu_child_process():
+            return
+
         logger.info("called save_model")
         buffer = io.BytesIO()
         hyperparams_str = ids_to_strings(cfg_helper.convert(self._hyperparams, dict, enum_to_str=True))
diff --git a/otx/algorithms/common/tasks/base_task.py b/otx/algorithms/common/tasks/base_task.py
index ed16b416fa3..d0051fc9dd0 100644
--- a/otx/algorithms/common/tasks/base_task.py
+++ b/otx/algorithms/common/tasks/base_task.py
@@ -15,6 +15,7 @@
 # and limitations under the License.
 
 import io
+import logging
 import os
 import shutil
 import tempfile
@@ -136,7 +137,10 @@ def _setup_multigpu_training():
         if not dist.is_initialized():
             torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
             dist.init_process_group(backend="nccl", init_method="env://", timeout=timedelta(seconds=30))
-            logger.info(f"Dist info: rank {dist.get_rank()} / {dist.get_world_size()} world_size")
+            rank = dist.get_rank()
+            logger.info(f"Dist info: rank {rank} / {dist.get_world_size()} world_size")
+            if rank != 0:
+                logging.disable(logging.WARNING)
 
     def _get_tmp_dir(self):
         self._work_dir_is_temp = True
diff --git a/otx/cli/tools/train.py b/otx/cli/tools/train.py
index 7acbab7c58d..3bbe602521e 100644
--- a/otx/cli/tools/train.py
+++ b/otx/cli/tools/train.py
@@ -24,7 +24,6 @@
 
 from otx.api.entities.inference_parameters import InferenceParameters
 from otx.api.entities.model import ModelEntity
-from otx.api.entities.model_template import TaskType
 from otx.api.entities.resultset import ResultSetEntity
 from otx.api.entities.subset import Subset
 from otx.api.entities.task_environment import TaskEnvironment
@@ -229,9 +228,7 @@ def train(exit_stack: Optional[ExitStack] = None):  # pylint: disable=too-many-b
 
     if args.gpus:
         multigpu_manager = MultiGPUManager(train, args.gpus, args.rdzv_endpoint, args.base_rank, args.world_size)
-        if template.task_type in (TaskType.ACTION_CLASSIFICATION, TaskType.ACTION_DETECTION):
-            print("Multi-GPU training for action tasks isn't supported yet. A single GPU will be used for a training.")
-        elif (
+        if (
             multigpu_manager.is_available()
             and not template.task_type.is_anomaly  # anomaly tasks don't use this way for multi-GPU training
         ):
diff --git a/tests/e2e/cli/action/test_action_classification.py b/tests/e2e/cli/action/test_action_classification.py
index 7b21045020e..40796ee9253 100644
--- a/tests/e2e/cli/action/test_action_classification.py
+++ b/tests/e2e/cli/action/test_action_classification.py
@@ -3,9 +3,11 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 
+import copy
 import os
 
 import pytest
+import torch
 
 from otx.api.entities.model_template import parse_model_template
 from otx.cli.registry import Registry
@@ -30,6 +32,7 @@
 
 otx_dir = os.getcwd()
 
+MULTI_GPU_UNAVAILABLE = torch.cuda.device_count() <= 1
 TT_STABILITY_TESTS = os.environ.get("TT_STABILITY_TESTS", False)
 if TT_STABILITY_TESTS:
     default_template = parse_model_template(
@@ -87,3 +90,13 @@ def test_pot_eval(self, template, tmp_dir_path):
             pytest.skip(reason="[CVS-106939] MoViNet fails with POT")
         tmp_dir_path = tmp_dir_path / "action_cls"
         pot_eval_testing(template, tmp_dir_path, otx_dir, args)
+
+    @e2e_pytest_component
+    @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
+    @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient")
+    @pytest.mark.parametrize("template", templates, ids=templates_ids)
+    def test_otx_multi_gpu_train(self, template, tmp_dir_path):
+        """Run the training test for each template with ``--gpus`` set to ``0,1``."""
+        multi_gpu_args = copy.deepcopy(args)
+        multi_gpu_args["--gpus"] = "0,1"
+        otx_train_testing(template, tmp_dir_path / "action_cls/test_multi_gpu", otx_dir, multi_gpu_args)
diff --git a/tests/e2e/cli/action/test_action_detection.py b/tests/e2e/cli/action/test_action_detection.py
index cb156fc5dc8..2ce73eb80b7 100644
--- a/tests/e2e/cli/action/test_action_detection.py
+++ b/tests/e2e/cli/action/test_action_detection.py
@@ -3,9 +3,11 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 
+import copy
 import os
 
 import pytest
+import torch
 
 from otx.api.entities.model_template import parse_model_template
 from otx.cli.registry import Registry
@@ -30,6 +32,7 @@
 
 otx_dir = os.getcwd()
 
+MULTI_GPU_UNAVAILABLE = torch.cuda.device_count() <= 1
 TT_STABILITY_TESTS = os.environ.get("TT_STABILITY_TESTS", False)
 if TT_STABILITY_TESTS:
     default_template = parse_model_template(
@@ -86,3 +89,13 @@ def test_pot_optimize(self, template, tmp_dir_path):
     def test_pot_eval(self, template, tmp_dir_path):
         tmp_dir_path = tmp_dir_path / "action_det"
         pot_eval_testing(template, tmp_dir_path, otx_dir, args)
+
+    @e2e_pytest_component
+    @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
+    @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient")
+    @pytest.mark.parametrize("template", templates, ids=templates_ids)
+    def test_otx_multi_gpu_train(self, template, tmp_dir_path):
+        """Run the training test for each template with ``--gpus`` set to ``0,1``."""
+        multi_gpu_args = copy.deepcopy(args)
+        multi_gpu_args["--gpus"] = "0,1"
+        otx_train_testing(template, tmp_dir_path / "action_det/test_multi_gpu", otx_dir, multi_gpu_args)
diff --git a/tests/integration/cli/action/test_action_classification.py b/tests/integration/cli/action/test_action_classification.py
index 550c3d79100..b102e2b2efa 100644
--- a/tests/integration/cli/action/test_action_classification.py
+++ b/tests/integration/cli/action/test_action_classification.py
@@ -7,6 +7,7 @@
 from copy import deepcopy
 
 import pytest
+import torch
 
 from otx.api.entities.model_template import parse_model_template
 from otx.cli.registry import Registry
@@ -29,6 +30,7 @@
 
 otx_dir = os.getcwd()
 
+MULTI_GPU_UNAVAILABLE = torch.cuda.device_count() <= 1
 TT_STABILITY_TESTS = os.environ.get("TT_STABILITY_TESTS", False)
 if TT_STABILITY_TESTS:
     default_template = parse_model_template(
@@ -77,3 +79,13 @@ def test_otx_train_auto_decrease_batch_size(self, template, tmp_dir_path):
         decrease_bs_args["train_params"].extend(["--learning_parameters.auto_decrease_batch_size", "true"])
         tmp_dir_path = tmp_dir_path / "action_cls_auto_decrease_batch_size"
         otx_train_testing(template, tmp_dir_path, otx_dir, decrease_bs_args)
+
+    @e2e_pytest_component
+    @pytest.mark.skipif(MULTI_GPU_UNAVAILABLE, reason="The number of gpu is insufficient")
+    @pytest.mark.skipif(TT_STABILITY_TESTS, reason="This is TT_STABILITY_TESTS")
+    @pytest.mark.parametrize("template", templates, ids=templates_ids)
+    def test_otx_multi_gpu_train(self, template, tmp_dir_path):
+        """Run the training test for each template with ``--gpus`` set to ``0,1``."""
+        multi_gpu_args = deepcopy(args)
+        multi_gpu_args["--gpus"] = "0,1"
+        otx_train_testing(template, tmp_dir_path / "action_cls/test_multi_gpu", otx_dir, multi_gpu_args)