Update code to support features other than 'train' on XPU #2704

Merged
10 changes: 7 additions & 3 deletions src/otx/algorithms/common/adapters/mmcv/utils/config_utils.py
@@ -25,6 +25,7 @@
 from mmcv.utils.path import check_file_exist
 
 from otx.algorithms.common.configs.configuration_enums import InputSizePreset
+from otx.algorithms.common.utils import is_xpu_available
 from otx.api.entities.datasets import DatasetEntity
 from otx.utils.logger import get_logger
 
@@ -505,11 +506,14 @@ def patch_persistent_workers(config: Config):

 def get_adaptive_num_workers(num_dataloader: int = 1) -> Union[int, None]:
     """Measure appropriate num_workers value and return it."""
-    num_gpus = torch.cuda.device_count()
-    if num_gpus == 0:
+    if is_xpu_available():
+        num_devices = torch.xpu.device_count()
+    else:
+        num_devices = torch.cuda.device_count()
+    if num_devices == 0:
         logger.warning("There is no GPUs. Use existing num_worker value.")
         return None
-    return min(multiprocessing.cpu_count() // (num_dataloader * num_gpus), 8)  # max available num_workers is 8
+    return min(multiprocessing.cpu_count() // (num_dataloader * num_devices), 8)  # max available num_workers is 8
 
 
 def patch_from_hyperparams(config: Config, hyperparams, **kwargs):
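For reference, a minimal, self-contained sketch of the patched helper's behavior. The body of `is_xpu_available` is not part of this diff; it is assumed here to probe the `torch.xpu` backend, which matches how the helper is used above:

```python
import multiprocessing
from typing import Union

import torch


def is_xpu_available() -> bool:
    # Assumption: the otx helper checks for a usable torch.xpu backend.
    return hasattr(torch, "xpu") and torch.xpu.is_available()


def get_adaptive_num_workers(num_dataloader: int = 1) -> Union[int, None]:
    # Count XPUs when present, CUDA GPUs otherwise, as in the patched code.
    num_devices = torch.xpu.device_count() if is_xpu_available() else torch.cuda.device_count()
    if num_devices == 0:
        return None  # no accelerator: keep the user-configured num_workers
    # Divide CPU cores across dataloaders and devices, capped at 8 workers.
    return min(multiprocessing.cpu_count() // (num_dataloader * num_devices), 8)
```

On a 32-core host with 2 XPUs and one dataloader this yields `min(32 // 2, 8) = 8` workers.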
58 changes: 0 additions & 58 deletions src/otx/algorithms/detection/adapters/mmdet/apis/train.py
@@ -7,27 +7,21 @@
 import os
 
 import torch
-from mmcv.ops.nms import NMSop
-from mmcv.ops.roi_align import RoIAlign
 from mmcv.runner import (
     DistSamplerSeedHook,
     EpochBasedRunner,
     OptimizerHook,
     build_runner,
     get_dist_info,
 )
-from mmcv.utils import ext_loader
 from mmdet.core import DistEvalHook, EvalHook, build_optimizer
 from mmdet.datasets import build_dataloader, build_dataset, replace_ImageToTensor
 from mmdet.utils import build_ddp, compat_cfg, find_latest_checkpoint, get_root_logger
 from mmdet.utils.util_distribution import build_dp, dp_factory
-from torchvision.ops import nms as tv_nms
-from torchvision.ops import roi_align as tv_roi_align
 
 from otx.algorithms.common.adapters.mmcv.utils import HPUDataParallel, XPUDataParallel
 from otx.algorithms.common.adapters.mmcv.utils.hpu_optimizers import HABANA_OPTIMIZERS
 
-ext_module = ext_loader.load_ext("_ext", ["nms", "softnms", "nms_match", "nms_rotated", "nms_quadri"])
 dp_factory["xpu"] = XPUDataParallel
 dp_factory["hpu"] = HPUDataParallel
 
@@ -134,11 +128,6 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times
     # build optimizer
     auto_scale_lr(cfg, distributed, logger)
 
-    if cfg.device in ["hpu", "xpu"]:
-        # dynamic patch for nms and roi_align
-        NMSop.forward = monkey_patched_nms
-        RoIAlign.forward = monkey_patched_roi_align
-
     optimizer = build_optimizer(model, cfg.optimizer)
 
     if cfg.device == "xpu":
@@ -211,50 +200,3 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times
     elif cfg.load_from:
         runner.load_checkpoint(cfg.load_from)
     runner.run(data_loaders, cfg.workflow)
-
-
-def monkey_patched_nms(ctx, bboxes, scores, iou_threshold, offset, score_threshold, max_num):
-    """Runs MMCVs NMS with torchvision.nms, or forces NMS from MMCV to run on CPU."""
-    is_filtering_by_score = score_threshold > 0
-    if is_filtering_by_score:
-        valid_mask = scores > score_threshold
-        bboxes, scores = bboxes[valid_mask], scores[valid_mask]
-        valid_inds = torch.nonzero(valid_mask, as_tuple=False).squeeze(dim=1)
-
-    if bboxes.dtype == torch.bfloat16:
-        bboxes = bboxes.to(torch.float32)
-    if scores.dtype == torch.bfloat16:
-        scores = scores.to(torch.float32)
-
-    if offset == 0:
-        inds = tv_nms(bboxes, scores, float(iou_threshold))
-    else:
-        device = bboxes.device
-        bboxes = bboxes.to("cpu")
-        scores = scores.to("cpu")
-        inds = ext_module.nms(bboxes, scores, iou_threshold=float(iou_threshold), offset=offset)
-        bboxes = bboxes.to(device)
-        scores = scores.to(device)
-
-    if max_num > 0:
-        inds = inds[:max_num]
-    if is_filtering_by_score:
-        inds = valid_inds[inds]
-    return inds
-
-
-def monkey_patched_roi_align(self, input, rois):
-    """Replaces MMCVs roi align with the one from torchvision.
-
-    Args:
-        self: patched instance
-        input: NCHW images
-        rois: Bx5 boxes. First column is the index into N. The other 4 columns are xyxy.
-    """
-
-    if "aligned" in tv_roi_align.__code__.co_varnames:
-        return tv_roi_align(input, rois, self.output_size, self.spatial_scale, self.sampling_ratio, self.aligned)
-    else:
-        if self.aligned:
-            rois -= rois.new_tensor([0.0] + [0.5 / self.spatial_scale] * 4)
-        return tv_roi_align(input, rois, self.output_size, self.spatial_scale, self.sampling_ratio)
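The removed helpers reappear under `utils/config_utils.py` below. The CPU branch of `monkey_patched_nms` (offset != 0) illustrates a general round-trip pattern for ops that lack an XPU/HPU kernel; a generic sketch of that pattern, with `cpu_only_op` as a made-up stand-in:

```python
import torch


def run_on_cpu(op, *tensors, **kwargs):
    """Run an op that only has a CPU kernel, preserving the input device."""
    device = tensors[0].device
    result = op(*[t.to("cpu") for t in tensors], **kwargs)
    return result.to(device)


def cpu_only_op(x, y):  # hypothetical op without an XPU/HPU kernel
    return x + y


a = torch.ones(3)  # would live on "xpu" on an XPU host
b = torch.ones(3)
print(run_on_cpu(cpu_only_op, a, b))  # tensor([2., 2., 2.])
```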
11 changes: 11 additions & 0 deletions src/otx/algorithms/detection/adapters/mmdet/configurer.py
@@ -5,6 +5,8 @@
 
 from typing import Optional, Tuple
 
+from mmcv.ops.nms import NMSop
+from mmcv.ops.roi_align import RoIAlign
 from mmcv.utils import ConfigDict
 
 from otx.algorithms.common.adapters.mmcv.clsincr_mixin import IncrConfigurerMixin
@@ -15,6 +17,8 @@
 )
 from otx.algorithms.detection.adapters.mmdet.utils import (
     cluster_anchors,
+    monkey_patched_nms,
+    monkey_patched_roi_align,
     patch_tiling,
     should_cluster_anchors,
 )
@@ -72,6 +76,13 @@ def configure_task(self, cfg, **kwargs):
         if self.task_adapt_type == "default_task_adapt":
             self.configure_bbox_head(cfg)
 
+    def configure_device(self, cfg):
+        """Setting device for training and inference."""
+        super().configure_device(cfg)
+        if cfg.device in ["xpu", "hpu"]:
+            NMSop.forward = monkey_patched_nms
+            RoIAlign.forward = monkey_patched_roi_align
+
     def configure_classes(self, cfg):
         """Patch classes for model and dataset."""
         super().configure_classes(cfg)
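`configure_device` hooks the patching into configuration, so it runs wherever the configurer does rather than only inside `train_detector`. A minimal sketch of the class-attribute monkey-patching pattern it relies on; `Op` and the kernel strings are illustrative, not the MMCV classes:

```python
class Op:
    @staticmethod
    def forward(x):
        return "native kernel"


def patched_forward(x):
    # Stand-in for monkey_patched_nms / monkey_patched_roi_align.
    return "torchvision / CPU fallback kernel"


def configure_device(device: str) -> None:
    if device in ["xpu", "hpu"]:
        # Rebind the class attribute once; every later caller that goes
        # through Op.forward picks up the patched implementation.
        Op.forward = patched_forward


configure_device("xpu")
print(Op.forward("img"))  # -> "torchvision / CPU fallback kernel"
```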
12 changes: 1 addition & 11 deletions src/otx/algorithms/detection/adapters/mmdet/task.py
@@ -13,8 +13,6 @@
 from typing import Any, Dict, Optional, Union
 
 import torch
-from mmcv.ops.nms import NMSop
-from mmcv.ops.roi_align import RoIAlign
 from mmcv.runner import wrap_fp16_model
 from mmcv.utils import Config, ConfigDict, get_git_hash
 from mmdet import __version__
@@ -42,11 +40,7 @@
 from otx.algorithms.common.configs.training_base import TrainType
 from otx.algorithms.common.tasks.nncf_task import NNCFBaseTask
 from otx.algorithms.common.utils.data import get_dataset
-from otx.algorithms.detection.adapters.mmdet.apis.train import (
-    monkey_patched_nms,
-    monkey_patched_roi_align,
-    train_detector,
-)
+from otx.algorithms.detection.adapters.mmdet.apis.train import train_detector
 from otx.algorithms.detection.adapters.mmdet.configurer import (
     DetectionConfigurer,
     IncrDetectionConfigurer,
@@ -348,10 +342,6 @@ def _infer_model(
         else:
             target_classes = mm_dataset.CLASSES
 
-        if cfg.device in ["xpu", "hpu"]:
-            NMSop.forward = monkey_patched_nms
-            RoIAlign.forward = monkey_patched_roi_align
-
         # Model
         model = self.build_model(cfg, fp16=cfg.get("fp16", False))
         model.CLASSES = target_classes
4 changes: 4 additions & 0 deletions src/otx/algorithms/detection/adapters/mmdet/utils/__init__.py
@@ -6,6 +6,8 @@
 from .builder import build_detector
 from .config_utils import (
     cluster_anchors,
+    monkey_patched_nms,
+    monkey_patched_roi_align,
     patch_input_preprocessing,
     patch_input_shape,
     patch_ir_scale_factor,
@@ -21,4 +23,6 @@
     "patch_input_shape",
     "patch_ir_scale_factor",
     "should_cluster_anchors",
+    "monkey_patched_nms",
+    "monkey_patched_roi_align",
 ]
52 changes: 52 additions & 0 deletions src/otx/algorithms/detection/adapters/mmdet/utils/config_utils.py
@@ -2,7 +2,11 @@
 # Copyright (C) 2022-2023 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
+import torch
 from mmcv import Config, ConfigDict
+from mmcv.utils import ext_loader
+from torchvision.ops import nms as tv_nms
+from torchvision.ops import roi_align as tv_roi_align
 
 from otx.algorithms.common.adapters.mmcv.utils import (
     InputSizeManager,
@@ -30,6 +34,7 @@
 
 
 logger = get_logger()
+ext_module = ext_loader.load_ext("_ext", ["nms", "softnms", "nms_match", "nms_rotated", "nms_quadri"])
 
 
 def should_cluster_anchors(model_cfg: Config):
@@ -243,3 +248,50 @@ def patch_ir_scale_factor(deploy_cfg: ConfigDict, hyper_parameters: DetectionCon
             ConfigDict(opt_shapes=ConfigDict(input=[1, 3, ir_input_shape[2], ir_input_shape[3]]))
         ]
         print(f"-----------------> x {tile_ir_scale_factor} = {ir_input_shape}")
+
+
+def monkey_patched_nms(ctx, bboxes, scores, iou_threshold, offset, score_threshold, max_num):
+    """Runs MMCVs NMS with torchvision.nms, or forces NMS from MMCV to run on CPU."""
+    is_filtering_by_score = score_threshold > 0
+    if is_filtering_by_score:
+        valid_mask = scores > score_threshold
+        bboxes, scores = bboxes[valid_mask], scores[valid_mask]
+        valid_inds = torch.nonzero(valid_mask, as_tuple=False).squeeze(dim=1)
+
+    if bboxes.dtype == torch.bfloat16:
+        bboxes = bboxes.to(torch.float32)
+    if scores.dtype == torch.bfloat16:
+        scores = scores.to(torch.float32)
+
+    if offset == 0:
+        inds = tv_nms(bboxes, scores, float(iou_threshold))
+    else:
+        device = bboxes.device
+        bboxes = bboxes.to("cpu")
+        scores = scores.to("cpu")
+        inds = ext_module.nms(bboxes, scores, iou_threshold=float(iou_threshold), offset=offset)
+        bboxes = bboxes.to(device)
+        scores = scores.to(device)
+
+    if max_num > 0:
+        inds = inds[:max_num]
+    if is_filtering_by_score:
+        inds = valid_inds[inds]
+    return inds
+
+
+def monkey_patched_roi_align(self, input, rois):
+    """Replaces MMCVs roi align with the one from torchvision.
+
+    Args:
+        self: patched instance
+        input: NCHW images
+        rois: Bx5 boxes. First column is the index into N. The other 4 columns are xyxy.
+    """
+
+    if "aligned" in tv_roi_align.__code__.co_varnames:
+        return tv_roi_align(input, rois, self.output_size, self.spatial_scale, self.sampling_ratio, self.aligned)
+    else:
+        if self.aligned:
+            rois -= rois.new_tensor([0.0] + [0.5 / self.spatial_scale] * 4)
+        return tv_roi_align(input, rois, self.output_size, self.spatial_scale, self.sampling_ratio)
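A quick sanity check of the torchvision path (offset == 0) in the relocated `monkey_patched_nms`, using made-up boxes; the float32 upcast mirrors the bfloat16 guard above, since the underlying kernels expect float32:

```python
import torch
from torchvision.ops import nms as tv_nms

# bfloat16 inputs such as the patched NMS may receive on XPU/HPU.
bboxes = torch.tensor(
    [[0.0, 0.0, 10.0, 10.0], [1.0, 1.0, 11.0, 11.0], [50.0, 50.0, 60.0, 60.0]],
    dtype=torch.bfloat16,
)
scores = torch.tensor([0.9, 0.8, 0.7], dtype=torch.bfloat16)

# Upcast as in monkey_patched_nms, then run torchvision's NMS.
keep = tv_nms(bboxes.to(torch.float32), scores.to(torch.float32), 0.5)
print(keep)  # tensor([0, 2]): box 1 overlaps box 0 (IoU ~ 0.68) and is dropped
```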
15 changes: 13 additions & 2 deletions src/otx/cli/utils/hpo.py
@@ -19,6 +19,7 @@
 import torch
 import yaml
 
+from otx.algorithms.common.utils import is_xpu_available
 from otx.api.configuration.helper import create
 from otx.api.entities.datasets import DatasetEntity
 from otx.api.entities.model import ModelEntity
@@ -459,7 +460,12 @@ def run_hpo(self, train_func: Callable, data_roots: Dict[str, Dict]) -> Union[Di
         """
         self._environment.save_initial_weight(self._get_initial_model_weight_path())
         hpo_algo = self._get_hpo_algo()
-        resource_type = "gpu" if torch.cuda.is_available() else "cpu"
+        if torch.cuda.is_available():
+            resource_type = "gpu"
+        elif is_xpu_available():
+            resource_type = "xpu"
+        else:
+            resource_type = "cpu"
         run_hpo_loop(
             hpo_algo,
             partial(
@@ -497,6 +503,11 @@ def _get_hpo_algo(self):
         return hpo_algo
 
     def _prepare_asha(self):
+        if is_xpu_available():
+            asynchronous_sha = torch.xpu.device_count() != 1
+        else:
+            asynchronous_sha = torch.cuda.device_count() != 1
+
         args = {
             "search_space": self._hpo_config["hp_space"],
             "save_path": str(self._hpo_workdir),
@@ -511,7 +522,7 @@ def _prepare_asha(self):
             "expected_time_ratio": self._hpo_time_ratio,
             "prior_hyper_parameters": self._get_default_hyper_parameters(),
             "asynchronous_bracket": True,
-            "asynchronous_sha": torch.cuda.device_count() != 1,
+            "asynchronous_sha": asynchronous_sha,
         }
 
         logger.debug(f"ASHA args = {args}")
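Device selection is ordered: CUDA is checked before XPU, so a host with both reports "gpu". A condensed sketch of the two decisions added here, with `is_xpu_available` assumed as above:

```python
import torch


def is_xpu_available() -> bool:
    # Assumption: mirrors the otx helper.
    return hasattr(torch, "xpu") and torch.xpu.is_available()


def pick_resource_type() -> str:
    # Same priority as run_hpo: CUDA first, then XPU, then CPU.
    if torch.cuda.is_available():
        return "gpu"
    if is_xpu_available():
        return "xpu"
    return "cpu"


def use_asynchronous_sha() -> bool:
    # ASHA runs asynchronously unless exactly one accelerator is present.
    count = torch.xpu.device_count() if is_xpu_available() else torch.cuda.device_count()
    return count != 1
```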
42 changes: 24 additions & 18 deletions src/otx/hpo/hpo_runner.py
@@ -47,24 +47,26 @@ class HpoLoop:
     Args:
         hpo_algo (HpoBase): HPO algorithms.
         train_func (Callable): Function to train a model.
-        resource_type (Literal['gpu', 'cpu'], optional): Which type of resource to use.
+        resource_type (Literal['gpu', 'cpu', 'xpu'], optional): Which type of resource to use.
             If can be changed depending on environment. Defaults to "gpu".
         num_parallel_trial (Optional[int], optional): How many trials to run in parallel.
             It's used for CPUResourceManager. Defaults to None.
-        num_gpu_for_single_trial (Optional[int], optional): How many GPUs are used for a single trial.
-            It's used for GPUResourceManager. Defaults to None.
-        available_gpu (Optional[str], optional): How many GPUs are available. It's used for GPUResourceManager.
-            Defaults to None.
+        num_devices_per_trial (Optional[int], optional): Number of devices used for a single trial.
+            It's used for GPUResourceManager and XPUResourceManager.
+            Defaults to None.
+        available_devices (Optional[str], optional): Number of devices available.
+            It's used for GPUResourceManager and XPUResourceManager.
+            Defaults to None.
     """
 
     def __init__(
         self,
         hpo_algo: HpoBase,
         train_func: Callable,
-        resource_type: Literal["gpu", "cpu"] = "gpu",
+        resource_type: Literal["gpu", "cpu", "xpu"] = "gpu",
         num_parallel_trial: Optional[int] = None,
-        num_gpu_for_single_trial: Optional[int] = None,
-        available_gpu: Optional[str] = None,
+        num_devices_per_trial: Optional[int] = None,
+        available_devices: Optional[str] = None,
     ):
         self._hpo_algo = hpo_algo
         self._train_func = train_func
Expand All @@ -74,7 +76,7 @@ def __init__(
self._uid_index = 0
self._trial_fault_count = 0
self._resource_manager = get_resource_manager(
resource_type, num_parallel_trial, num_gpu_for_single_trial, available_gpu
resource_type, num_parallel_trial, num_devices_per_trial, available_devices
)
self._main_pid = os.getpid()

@@ -228,24 +230,28 @@ def _report_score(
 def run_hpo_loop(
     hpo_algo: HpoBase,
     train_func: Callable,
-    resource_type: Literal["gpu", "cpu"] = "gpu",
+    resource_type: Literal["gpu", "cpu", "xpu"] = "gpu",
     num_parallel_trial: Optional[int] = None,
-    num_gpu_for_single_trial: Optional[int] = None,
-    available_gpu: Optional[str] = None,
+    num_devices_per_trial: Optional[int] = None,
+    available_devices: Optional[str] = None,
 ):
     """Run the HPO loop.
 
     Args:
         hpo_algo (HpoBase): HPO algorithms.
         train_func (Callable): Function to train a model.
-        resource_type (Literal['gpu', 'cpu'], optional): Which type of resource to use.
+        resource_type (Literal['gpu', 'cpu', 'xpu'], optional): Which type of resource to use.
             If can be changed depending on environment. Defaults to "gpu".
         num_parallel_trial (Optional[int], optional): How many trials to run in parallel.
             It's used for CPUResourceManager. Defaults to None.
-        num_gpu_for_single_trial (Optional[int], optional): How many GPUs are used for a single trial.
-            It's used for GPUResourceManager. Defaults to None.
-        available_gpu (Optional[str], optional): How many GPUs are available. It's used for GPUResourceManager.
-            Defaults to None.
+        num_devices_per_trial (Optional[int], optional): Number of devices used for a single trial.
+            It's used for GPUResourceManager and XPUResourceManager.
+            Defaults to None.
+        available_devices (Optional[str], optional): Number of devices available.
+            It's used for GPUResourceManager and XPUResourceManager.
+            Defaults to None.
     """
-    hpo_loop = HpoLoop(hpo_algo, train_func, resource_type, num_parallel_trial, num_gpu_for_single_trial, available_gpu)
+    hpo_loop = HpoLoop(
+        hpo_algo, train_func, resource_type, num_parallel_trial, num_devices_per_trial, available_devices
+    )
    hpo_loop.run()
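A call with the renamed keywords, as the entry point now reads; `my_hpo_algo`, `my_train_func`, and the device string are placeholders, and only the keyword names come from this diff:

```python
run_hpo_loop(
    hpo_algo=my_hpo_algo,          # an HpoBase instance
    train_func=my_train_func,      # trains a single trial
    resource_type="xpu",
    num_devices_per_trial=1,       # was: num_gpu_for_single_trial
    available_devices="0,1",       # was: available_gpu; value format assumed comma-separated
)
```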