Update code to support features other than 'train' on XPU #2704

Merged
10 changes: 7 additions & 3 deletions src/otx/algorithms/common/adapters/mmcv/utils/config_utils.py
@@ -25,6 +25,7 @@
 from mmcv.utils.path import check_file_exist
 
 from otx.algorithms.common.configs.configuration_enums import InputSizePreset
+from otx.algorithms.common.utils import is_xpu_available
 from otx.api.entities.datasets import DatasetEntity
 from otx.utils.logger import get_logger
 
@@ -505,11 +506,14 @@ def patch_persistent_workers(config: Config):

 def get_adaptive_num_workers(num_dataloader: int = 1) -> Union[int, None]:
     """Measure appropriate num_workers value and return it."""
-    num_gpus = torch.cuda.device_count()
-    if num_gpus == 0:
+    if is_xpu_available():
+        num_devices = torch.xpu.device_count()
+    else:
+        num_devices = torch.cuda.device_count()
+    if num_devices == 0:
         logger.warning("There is no GPUs. Use existing num_worker value.")
         return None
-    return min(multiprocessing.cpu_count() // (num_dataloader * num_gpus), 8)  # max available num_workers is 8
+    return min(multiprocessing.cpu_count() // (num_dataloader * num_devices), 8)  # max available num_workers is 8
 
 
 def patch_from_hyperparams(config: Config, hyperparams, **kwargs):
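For reference, a minimal, self-contained sketch of the patched helper's behavior. The body of `is_xpu_available` is not part of this diff; it is assumed here to probe the `torch.xpu` backend, which matches how the helper is used above:

```python
import multiprocessing
from typing import Union

import torch


def is_xpu_available() -> bool:
    # Assumption: the otx helper checks for a usable torch.xpu backend.
    return hasattr(torch, "xpu") and torch.xpu.is_available()


def get_adaptive_num_workers(num_dataloader: int = 1) -> Union[int, None]:
    # Count XPUs when present, CUDA GPUs otherwise, as in the patched code.
    num_devices = torch.xpu.device_count() if is_xpu_available() else torch.cuda.device_count()
    if num_devices == 0:
        return None  # no accelerator: keep the user-configured num_workers
    # Divide CPU cores across dataloaders and devices, capped at 8 workers.
    return min(multiprocessing.cpu_count() // (num_dataloader * num_devices), 8)
```

On a 32-core host with 2 XPUs and one dataloader this yields `min(32 // 2, 8) = 8` workers.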
58 changes: 0 additions & 58 deletions src/otx/algorithms/detection/adapters/mmdet/apis/train.py
@@ -7,27 +7,21 @@
 import os
 
 import torch
-from mmcv.ops.nms import NMSop
-from mmcv.ops.roi_align import RoIAlign
 from mmcv.runner import (
     DistSamplerSeedHook,
     EpochBasedRunner,
     OptimizerHook,
     build_runner,
     get_dist_info,
 )
-from mmcv.utils import ext_loader
 from mmdet.core import DistEvalHook, EvalHook, build_optimizer
 from mmdet.datasets import build_dataloader, build_dataset, replace_ImageToTensor
 from mmdet.utils import build_ddp, compat_cfg, find_latest_checkpoint, get_root_logger
 from mmdet.utils.util_distribution import build_dp, dp_factory
-from torchvision.ops import nms as tv_nms
-from torchvision.ops import roi_align as tv_roi_align
 
 from otx.algorithms.common.adapters.mmcv.utils import HPUDataParallel, XPUDataParallel
 from otx.algorithms.common.adapters.mmcv.utils.hpu_optimizers import HABANA_OPTIMIZERS
 
-ext_module = ext_loader.load_ext("_ext", ["nms", "softnms", "nms_match", "nms_rotated", "nms_quadri"])
 dp_factory["xpu"] = XPUDataParallel
 dp_factory["hpu"] = HPUDataParallel
 
@@ -134,11 +128,6 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times
     # build optimizer
     auto_scale_lr(cfg, distributed, logger)
 
-    if cfg.device in ["hpu", "xpu"]:
-        # dynamic patch for nms and roi_align
-        NMSop.forward = monkey_patched_nms
-        RoIAlign.forward = monkey_patched_roi_align
-
     optimizer = build_optimizer(model, cfg.optimizer)
 
     if cfg.device == "xpu":
@@ -211,50 +200,3 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times
     elif cfg.load_from:
         runner.load_checkpoint(cfg.load_from)
     runner.run(data_loaders, cfg.workflow)
-
-
-def monkey_patched_nms(ctx, bboxes, scores, iou_threshold, offset, score_threshold, max_num):
-    """Runs MMCVs NMS with torchvision.nms, or forces NMS from MMCV to run on CPU."""
-    is_filtering_by_score = score_threshold > 0
-    if is_filtering_by_score:
-        valid_mask = scores > score_threshold
-        bboxes, scores = bboxes[valid_mask], scores[valid_mask]
-        valid_inds = torch.nonzero(valid_mask, as_tuple=False).squeeze(dim=1)
-
-    if bboxes.dtype == torch.bfloat16:
-        bboxes = bboxes.to(torch.float32)
-    if scores.dtype == torch.bfloat16:
-        scores = scores.to(torch.float32)
-
-    if offset == 0:
-        inds = tv_nms(bboxes, scores, float(iou_threshold))
-    else:
-        device = bboxes.device
-        bboxes = bboxes.to("cpu")
-        scores = scores.to("cpu")
-        inds = ext_module.nms(bboxes, scores, iou_threshold=float(iou_threshold), offset=offset)
-        bboxes = bboxes.to(device)
-        scores = scores.to(device)
-
-    if max_num > 0:
-        inds = inds[:max_num]
-    if is_filtering_by_score:
-        inds = valid_inds[inds]
-    return inds
-
-
-def monkey_patched_roi_align(self, input, rois):
-    """Replaces MMCVs roi align with the one from torchvision.
-
-    Args:
-        self: patched instance
-        input: NCHW images
-        rois: Bx5 boxes. First column is the index into N. The other 4 columns are xyxy.
-    """
-
-    if "aligned" in tv_roi_align.__code__.co_varnames:
-        return tv_roi_align(input, rois, self.output_size, self.spatial_scale, self.sampling_ratio, self.aligned)
-    else:
-        if self.aligned:
-            rois -= rois.new_tensor([0.0] + [0.5 / self.spatial_scale] * 4)
-        return tv_roi_align(input, rois, self.output_size, self.spatial_scale, self.sampling_ratio)
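The removed helpers reappear under `utils/config_utils.py` below. The CPU branch of `monkey_patched_nms` (offset != 0) illustrates a general round-trip pattern for ops that lack an XPU/HPU kernel; a generic sketch of that pattern, with `cpu_only_op` as a made-up stand-in:

```python
import torch


def run_on_cpu(op, *tensors, **kwargs):
    """Run an op that only has a CPU kernel, preserving the input device."""
    device = tensors[0].device
    result = op(*[t.to("cpu") for t in tensors], **kwargs)
    return result.to(device)


def cpu_only_op(x, y):  # hypothetical op without an XPU/HPU kernel
    return x + y


a = torch.ones(3)  # would live on "xpu" on an XPU host
b = torch.ones(3)
print(run_on_cpu(cpu_only_op, a, b))  # tensor([2., 2., 2.])
```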
11 changes: 11 additions & 0 deletions src/otx/algorithms/detection/adapters/mmdet/configurer.py
@@ -5,6 +5,8 @@
 
 from typing import Optional, Tuple
 
+from mmcv.ops.nms import NMSop
+from mmcv.ops.roi_align import RoIAlign
 from mmcv.utils import ConfigDict
 
 from otx.algorithms.common.adapters.mmcv.clsincr_mixin import IncrConfigurerMixin
@@ -15,6 +17,8 @@
 )
 from otx.algorithms.detection.adapters.mmdet.utils import (
     cluster_anchors,
+    monkey_patched_nms,
+    monkey_patched_roi_align,
     patch_tiling,
     should_cluster_anchors,
 )
@@ -72,6 +76,13 @@ def configure_task(self, cfg, **kwargs):
         if self.task_adapt_type == "default_task_adapt":
             self.configure_bbox_head(cfg)
 
+    def configure_device(self, cfg):
+        """Setting device for training and inference."""
+        super().configure_device(cfg)
+        if cfg.device in ["xpu", "hpu"]:
+            NMSop.forward = monkey_patched_nms
+            RoIAlign.forward = monkey_patched_roi_align
+
     def configure_classes(self, cfg):
         """Patch classes for model and dataset."""
         super().configure_classes(cfg)
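`configure_device` hooks the patching into configuration, so it runs wherever the configurer does rather than only inside `train_detector`. A minimal sketch of the class-attribute monkey-patching pattern it relies on; `Op` and the kernel strings are illustrative, not the MMCV classes:

```python
class Op:
    @staticmethod
    def forward(x):
        return "native kernel"


def patched_forward(x):
    # Stand-in for monkey_patched_nms / monkey_patched_roi_align.
    return "torchvision / CPU fallback kernel"


def configure_device(device: str) -> None:
    if device in ["xpu", "hpu"]:
        # Rebind the class attribute once; every later caller that goes
        # through Op.forward picks up the patched implementation.
        Op.forward = patched_forward


configure_device("xpu")
print(Op.forward("img"))  # -> "torchvision / CPU fallback kernel"
```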
12 changes: 1 addition & 11 deletions src/otx/algorithms/detection/adapters/mmdet/task.py
@@ -13,8 +13,6 @@
 from typing import Any, Dict, Optional, Union
 
 import torch
-from mmcv.ops.nms import NMSop
-from mmcv.ops.roi_align import RoIAlign
 from mmcv.runner import wrap_fp16_model
 from mmcv.utils import Config, ConfigDict, get_git_hash
 from mmdet import __version__
@@ -42,11 +40,7 @@
 from otx.algorithms.common.configs.training_base import TrainType
 from otx.algorithms.common.tasks.nncf_task import NNCFBaseTask
 from otx.algorithms.common.utils.data import get_dataset
-from otx.algorithms.detection.adapters.mmdet.apis.train import (
-    monkey_patched_nms,
-    monkey_patched_roi_align,
-    train_detector,
-)
+from otx.algorithms.detection.adapters.mmdet.apis.train import train_detector
 from otx.algorithms.detection.adapters.mmdet.configurer import (
     DetectionConfigurer,
     IncrDetectionConfigurer,
@@ -348,10 +342,6 @@ def _infer_model(
         else:
             target_classes = mm_dataset.CLASSES
 
-        if cfg.device in ["xpu", "hpu"]:
-            NMSop.forward = monkey_patched_nms
-            RoIAlign.forward = monkey_patched_roi_align
-
         # Model
         model = self.build_model(cfg, fp16=cfg.get("fp16", False))
         model.CLASSES = target_classes
4 changes: 4 additions & 0 deletions src/otx/algorithms/detection/adapters/mmdet/utils/__init__.py
@@ -6,6 +6,8 @@
 from .builder import build_detector
 from .config_utils import (
     cluster_anchors,
+    monkey_patched_nms,
+    monkey_patched_roi_align,
     patch_input_preprocessing,
     patch_input_shape,
     patch_ir_scale_factor,
@@ -21,4 +23,6 @@
     "patch_input_shape",
     "patch_ir_scale_factor",
     "should_cluster_anchors",
+    "monkey_patched_nms",
+    "monkey_patched_roi_align",
 ]
52 changes: 52 additions & 0 deletions src/otx/algorithms/detection/adapters/mmdet/utils/config_utils.py
@@ -2,7 +2,11 @@
 # Copyright (C) 2022-2023 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
+import torch
 from mmcv import Config, ConfigDict
+from mmcv.utils import ext_loader
+from torchvision.ops import nms as tv_nms
+from torchvision.ops import roi_align as tv_roi_align
 
 from otx.algorithms.common.adapters.mmcv.utils import (
     InputSizeManager,
@@ -30,6 +34,7 @@
 
 
 logger = get_logger()
+ext_module = ext_loader.load_ext("_ext", ["nms", "softnms", "nms_match", "nms_rotated", "nms_quadri"])
 
 
 def should_cluster_anchors(model_cfg: Config):
@@ -243,3 +248,50 @@ def patch_ir_scale_factor(deploy_cfg: ConfigDict, hyper_parameters: DetectionCon
             ConfigDict(opt_shapes=ConfigDict(input=[1, 3, ir_input_shape[2], ir_input_shape[3]]))
         ]
         print(f"-----------------> x {tile_ir_scale_factor} = {ir_input_shape}")
+
+
+def monkey_patched_nms(ctx, bboxes, scores, iou_threshold, offset, score_threshold, max_num):
+    """Runs MMCVs NMS with torchvision.nms, or forces NMS from MMCV to run on CPU."""
+    is_filtering_by_score = score_threshold > 0
+    if is_filtering_by_score:
+        valid_mask = scores > score_threshold
+        bboxes, scores = bboxes[valid_mask], scores[valid_mask]
+        valid_inds = torch.nonzero(valid_mask, as_tuple=False).squeeze(dim=1)
+
+    if bboxes.dtype == torch.bfloat16:
+        bboxes = bboxes.to(torch.float32)
+    if scores.dtype == torch.bfloat16:
+        scores = scores.to(torch.float32)
+
+    if offset == 0:
+        inds = tv_nms(bboxes, scores, float(iou_threshold))
+    else:
+        device = bboxes.device
+        bboxes = bboxes.to("cpu")
+        scores = scores.to("cpu")
+        inds = ext_module.nms(bboxes, scores, iou_threshold=float(iou_threshold), offset=offset)
+        bboxes = bboxes.to(device)
+        scores = scores.to(device)
+
+    if max_num > 0:
+        inds = inds[:max_num]
+    if is_filtering_by_score:
+        inds = valid_inds[inds]
+    return inds
+
+
+def monkey_patched_roi_align(self, input, rois):
+    """Replaces MMCVs roi align with the one from torchvision.
+
+    Args:
+        self: patched instance
+        input: NCHW images
+        rois: Bx5 boxes. First column is the index into N. The other 4 columns are xyxy.
+    """
+
+    if "aligned" in tv_roi_align.__code__.co_varnames:
+        return tv_roi_align(input, rois, self.output_size, self.spatial_scale, self.sampling_ratio, self.aligned)
+    else:
+        if self.aligned:
+            rois -= rois.new_tensor([0.0] + [0.5 / self.spatial_scale] * 4)
+        return tv_roi_align(input, rois, self.output_size, self.spatial_scale, self.sampling_ratio)
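A quick sanity check of the torchvision path (offset == 0) in the relocated `monkey_patched_nms`, using made-up boxes; the float32 upcast mirrors the bfloat16 guard above, since the underlying kernels expect float32:

```python
import torch
from torchvision.ops import nms as tv_nms

# bfloat16 inputs such as the patched NMS may receive on XPU/HPU.
bboxes = torch.tensor(
    [[0.0, 0.0, 10.0, 10.0], [1.0, 1.0, 11.0, 11.0], [50.0, 50.0, 60.0, 60.0]],
    dtype=torch.bfloat16,
)
scores = torch.tensor([0.9, 0.8, 0.7], dtype=torch.bfloat16)

# Upcast as in monkey_patched_nms, then run torchvision's NMS.
keep = tv_nms(bboxes.to(torch.float32), scores.to(torch.float32), 0.5)
print(keep)  # tensor([0, 2]): box 1 overlaps box 0 (IoU ~ 0.68) and is dropped
```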
15 changes: 13 additions & 2 deletions src/otx/cli/utils/hpo.py
@@ -19,6 +19,7 @@
 import torch
 import yaml
 
+from otx.algorithms.common.utils import is_xpu_available
 from otx.api.configuration.helper import create
 from otx.api.entities.datasets import DatasetEntity
 from otx.api.entities.model import ModelEntity
@@ -459,7 +460,12 @@ def run_hpo(self, train_func: Callable, data_roots: Dict[str, Dict]) -> Union[Di
         """
         self._environment.save_initial_weight(self._get_initial_model_weight_path())
         hpo_algo = self._get_hpo_algo()
-        resource_type = "gpu" if torch.cuda.is_available() else "cpu"
+        if torch.cuda.is_available():
+            resource_type = "gpu"
+        elif is_xpu_available():
+            resource_type = "xpu"
+        else:
+            resource_type = "cpu"
         run_hpo_loop(
             hpo_algo,
             partial(
@@ -497,6 +503,11 @@ def _get_hpo_algo(self):
         return hpo_algo
 
     def _prepare_asha(self):
+        if is_xpu_available():
+            asynchronous_sha = torch.xpu.device_count() != 1
+        else:
+            asynchronous_sha = torch.cuda.device_count() != 1
+
         args = {
             "search_space": self._hpo_config["hp_space"],
             "save_path": str(self._hpo_workdir),
@@ -511,7 +522,7 @@ def _prepare_asha(self):
             "expected_time_ratio": self._hpo_time_ratio,
             "prior_hyper_parameters": self._get_default_hyper_parameters(),
             "asynchronous_bracket": True,
-            "asynchronous_sha": torch.cuda.device_count() != 1,
+            "asynchronous_sha": asynchronous_sha,
         }
 
         logger.debug(f"ASHA args = {args}")
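Device selection is ordered: CUDA is checked before XPU, so a host with both reports "gpu". A condensed sketch of the two decisions added here, with `is_xpu_available` assumed as above:

```python
import torch


def is_xpu_available() -> bool:
    # Assumption: mirrors the otx helper.
    return hasattr(torch, "xpu") and torch.xpu.is_available()


def pick_resource_type() -> str:
    # Same priority as run_hpo: CUDA first, then XPU, then CPU.
    if torch.cuda.is_available():
        return "gpu"
    if is_xpu_available():
        return "xpu"
    return "cpu"


def use_asynchronous_sha() -> bool:
    # ASHA runs asynchronously unless exactly one accelerator is present.
    count = torch.xpu.device_count() if is_xpu_available() else torch.cuda.device_count()
    return count != 1
```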
42 changes: 24 additions & 18 deletions src/otx/hpo/hpo_runner.py
@@ -47,24 +47,26 @@ class HpoLoop:
     Args:
         hpo_algo (HpoBase): HPO algorithms.
         train_func (Callable): Function to train a model.
-        resource_type (Literal['gpu', 'cpu'], optional): Which type of resource to use.
+        resource_type (Literal['gpu', 'cpu', 'xpu'], optional): Which type of resource to use.
             If can be changed depending on environment. Defaults to "gpu".
         num_parallel_trial (Optional[int], optional): How many trials to run in parallel.
             It's used for CPUResourceManager. Defaults to None.
-        num_gpu_for_single_trial (Optional[int], optional): How many GPUs are used for a single trial.
-            It's used for GPUResourceManager. Defaults to None.
-        available_gpu (Optional[str], optional): How many GPUs are available. It's used for GPUResourceManager.
-            Defaults to None.
+        num_devices_per_trial (Optional[int], optional): Number of devices used for a single trial.
+            It's used for GPUResourceManager and XPUResourceManager.
+            Defaults to None.
+        available_devices (Optional[str], optional): Number of devices available.
+            It's used for GPUResourceManager and XPUResourceManager.
+            Defaults to None.
     """
 
     def __init__(
         self,
         hpo_algo: HpoBase,
         train_func: Callable,
-        resource_type: Literal["gpu", "cpu"] = "gpu",
+        resource_type: Literal["gpu", "cpu", "xpu"] = "gpu",
         num_parallel_trial: Optional[int] = None,
-        num_gpu_for_single_trial: Optional[int] = None,
-        available_gpu: Optional[str] = None,
+        num_devices_per_trial: Optional[int] = None,
+        available_devices: Optional[str] = None,
     ):
         self._hpo_algo = hpo_algo
         self._train_func = train_func
Expand All @@ -74,7 +76,7 @@ def __init__(
self._uid_index = 0
self._trial_fault_count = 0
self._resource_manager = get_resource_manager(
resource_type, num_parallel_trial, num_gpu_for_single_trial, available_gpu
resource_type, num_parallel_trial, num_devices_per_trial, available_devices
)
self._main_pid = os.getpid()

@@ -228,24 +230,28 @@ def _report_score(
 def run_hpo_loop(
     hpo_algo: HpoBase,
     train_func: Callable,
-    resource_type: Literal["gpu", "cpu"] = "gpu",
+    resource_type: Literal["gpu", "cpu", "xpu"] = "gpu",
     num_parallel_trial: Optional[int] = None,
-    num_gpu_for_single_trial: Optional[int] = None,
-    available_gpu: Optional[str] = None,
+    num_devices_per_trial: Optional[int] = None,
+    available_devices: Optional[str] = None,
 ):
     """Run the HPO loop.
 
     Args:
         hpo_algo (HpoBase): HPO algorithms.
         train_func (Callable): Function to train a model.
-        resource_type (Literal['gpu', 'cpu'], optional): Which type of resource to use.
+        resource_type (Literal['gpu', 'cpu', 'xpu'], optional): Which type of resource to use.
             If can be changed depending on environment. Defaults to "gpu".
         num_parallel_trial (Optional[int], optional): How many trials to run in parallel.
             It's used for CPUResourceManager. Defaults to None.
-        num_gpu_for_single_trial (Optional[int], optional): How many GPUs are used for a single trial.
-            It's used for GPUResourceManager. Defaults to None.
-        available_gpu (Optional[str], optional): How many GPUs are available. It's used for GPUResourceManager.
-            Defaults to None.
+        num_devices_per_trial (Optional[int], optional): Number of devices used for a single trial.
+            It's used for GPUResourceManager and XPUResourceManager.
+            Defaults to None.
+        available_devices (Optional[str], optional): Number of devices available.
+            It's used for GPUResourceManager and XPUResourceManager.
+            Defaults to None.
     """
-    hpo_loop = HpoLoop(hpo_algo, train_func, resource_type, num_parallel_trial, num_gpu_for_single_trial, available_gpu)
+    hpo_loop = HpoLoop(
+        hpo_algo, train_func, resource_type, num_parallel_trial, num_devices_per_trial, available_devices
+    )
    hpo_loop.run()
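A call with the renamed keywords, as the entry point now reads; `my_hpo_algo`, `my_train_func`, and the device string are placeholders, and only the keyword names come from this diff:

```python
run_hpo_loop(
    hpo_algo=my_hpo_algo,          # an HpoBase instance
    train_func=my_train_func,      # trains a single trial
    resource_type="xpu",
    num_devices_per_trial=1,       # was: num_gpu_for_single_trial
    available_devices="0,1",       # was: available_gpu; value format assumed comma-separated
)
```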