openvinotoolkit · eunwoosh · Nov 10, 2023 · Oct 31, 2023 · Oct 31, 2023 · Nov 6, 2023
diff --git a/src/otx/algorithms/common/adapters/mmcv/configurer.py b/src/otx/algorithms/common/adapters/mmcv/configurer.py
@@ -176,7 +176,7 @@ def configure_device(self, cfg):
         elif "gpu_ids" not in cfg:
             cfg.gpu_ids = range(1)
 
-        # consider "cuda", "hpu" and "cpu" device only
+        # consider "cuda", "xpu", "hpu" and "cpu" device only
         if is_hpu_available():
             cfg.device = "hpu"
         elif torch.cuda.is_available():

diff --git a/src/otx/algorithms/common/adapters/mmcv/hooks/recording_forward_hook.py b/src/otx/algorithms/common/adapters/mmcv/hooks/recording_forward_hook.py
@@ -23,6 +23,7 @@
 from torch.nn import LayerNorm
 
 from otx.algorithms.classification import MMCLS_AVAILABLE
+from otx.algorithms.common.utils.utils import cast_bf16_to_fp32
 
 if MMCLS_AVAILABLE:
     from mmcls.models.necks.gap import GlobalAveragePooling
@@ -74,7 +75,7 @@ def _recording_forward(
     ):  # pylint: disable=unused-argument
         tensors = self.func(output)
         if isinstance(tensors, torch.Tensor):
-            tensors_np = tensors.detach().cpu().numpy()
+            tensors_np = cast_bf16_to_fp32(tensors).detach().cpu().numpy()
         elif isinstance(tensors, np.ndarray):
             tensors_np = tensors
         else:

diff --git a/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py b/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py
@@ -64,7 +64,7 @@ def build_data_parallel(
         model = model.xpu()
         model = XPUDataParallel(model, device_ids=config.gpu_ids)
     elif is_hpu_available() and config.get("gpu_ids", []):
-        model = model.hpu()
+        model = model.to("hpu")
         model = HPUDataParallel(model, device_ids=config.gpu_ids)
     elif torch.cuda.is_available() and config.get("gpu_ids", []):
         if distributed:
@@ -140,9 +140,10 @@ def val_step(self, *inputs, **kwargs):
 
 
 class HPUDataParallel(MMDataParallel):
-    def __init__(self, *args, enable_autocast: bool = False, **kwargs):
+    def __init__(self, *args, enable_autocast: bool = False, put_gt_on_device: bool = True, **kwargs):
         super().__init__(*args, **kwargs)
         self.enable_autocast = enable_autocast
+        self.put_gt_on_device = put_gt_on_device  # False: scatter() skips dict keys starting with "gt_"
         self.src_device_obj = torch.device("hpu", self.device_ids[0])
 
     def scatter(self, inputs, kwargs, device_ids):
@@ -153,6 +154,8 @@ def scatter(self, inputs, kwargs, device_ids):
                 for val in x:
                     if isinstance(val, dict):
                         for k in val:
+                            if not self.put_gt_on_device and k.startswith("gt_"):
+                                continue
                             if isinstance(val[k], torch.Tensor):
                                 val[k] = val[k].to(self.src_device_obj)
                             elif isinstance(val[k], list):

diff --git a/src/otx/algorithms/detection/adapters/mmdet/apis/train.py b/src/otx/algorithms/detection/adapters/mmdet/apis/train.py
@@ -24,10 +24,12 @@
 from torchvision.ops import nms as tv_nms
 from torchvision.ops import roi_align as tv_roi_align
 
-from otx.algorithms.common.adapters.mmcv.utils import XPUDataParallel
+from otx.algorithms.common.adapters.mmcv.utils import HPUDataParallel, XPUDataParallel
+from otx.algorithms.common.adapters.mmcv.utils.hpu_optimizers import HABANA_OPTIMIZERS
 
 ext_module = ext_loader.load_ext("_ext", ["nms", "softnms", "nms_match", "nms_rotated", "nms_quadri"])
 dp_factory["xpu"] = XPUDataParallel
+dp_factory["hpu"] = HPUDataParallel
 
 
 def auto_scale_lr(cfg, distributed, logger):
@@ -119,11 +121,26 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times
     elif cfg.device == "xpu":
         model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids, enable_autocast=bool(fp16_cfg))
         model.to(f"xpu:{cfg.gpu_ids[0]}")
+    elif cfg.device == "hpu":
+        import habana_frameworks.torch.core as htcore
+        from habana_frameworks.torch.utils.library_loader import load_habana_module
+
+        load_habana_module()
+        os.environ["PT_HPU_LAZY_MODE"] = "1"
+        assert len(cfg.gpu_ids) == 1
+        model = build_dp(
+            model, cfg.device, device_ids=cfg.gpu_ids, dim=0, enable_autocast=bool(fp16_cfg), put_gt_on_device=False
+        )
+        model.to(model.src_device_obj)
+        htcore.mark_step()
+        model.zero_grad()
     else:
         model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids)
 
     # build optimizer
     auto_scale_lr(cfg, distributed, logger)
+    if cfg.device == "hpu":
+        cfg.optimizer = patch_optimizer(cfg.optimizer)
     optimizer = build_optimizer(model, cfg.optimizer)
 
     if cfg.device == "xpu":
@@ -137,6 +154,14 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times
         model.train()
         model, optimizer = torch.xpu.optimize(model, optimizer=optimizer, dtype=dtype)
 
+    if cfg.device == "hpu":
+        NMSop.forward = monkey_patched_xpu_nms
+        RoIAlign.forward = monkey_patched_xpu_roi_align
+        # build runner
+        if cfg.device == "hpu":
+            if (new_type := "Fused" + cfg.optimizer.get("type", "SGD")) in HABANA_OPTIMIZERS:
+                cfg.optimizer["type"] = new_type
+
     runner = build_runner(
         cfg.runner, default_args=dict(model=model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta)
     )
@@ -198,6 +223,18 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times
     runner.run(data_loaders, cfg.workflow)
 
 
+def patch_optimizer(cfg_optim):
+    """Force the optimizer config to SGD, the only type supported for OD and IS on HPU by now.
+
+    The config is modified in place and returned. "betas" is dropped when the type is switched, and
+    a config without a "type" key is switched to SGD as well instead of raising KeyError.
+    """
+    if cfg_optim.get("type") != "SGD":
+        cfg_optim["type"] = "SGD"
+        cfg_optim.pop("betas", None)
+    return cfg_optim
+
+
 def monkey_patched_xpu_nms(ctx, bboxes, scores, iou_threshold, offset, score_threshold, max_num):
     """Runs MMCVs NMS with torchvision.nms, or forces NMS from MMCV to run on CPU."""
     is_filtering_by_score = score_threshold > 0

diff --git a/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py b/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py
@@ -15,13 +15,15 @@
 # and limitations under the License.
 
 import multiprocessing as mp
+import time
 from typing import Dict, List, Tuple, Union
 
 import mmcv
 import numpy as np
 import pycocotools.mask as mask_util
 from mmcv.utils import print_log
 from mmdet.core import BitmapMasks, PolygonMasks, eval_map
+from mmdet.core.evaluation import mean_ap
 from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps
 from mmdet.core.evaluation.class_names import get_classes
 from mmdet.core.evaluation.mean_ap import average_precision
@@ -59,6 +61,7 @@ def print_map_summary(  # pylint: disable=too-many-locals,too-many-branches
     if scale_ranges is not None:
         assert len(scale_ranges) == num_scales
 
+    segmentation = "miou" in results
     num_classes = len(results)
 
     recalls = np.zeros((num_scales, num_classes), dtype=np.float32)
@@ -69,7 +72,8 @@ def print_map_summary(  # pylint: disable=too-many-locals,too-many-branches
         if cls_result["recall"].size > 0:
             recalls[:, i] = np.array(cls_result["recall"], ndmin=2)[:, -1]
         aps[:, i] = cls_result["ap"]
-        mious[:, i] = cls_result["miou"]
+        if segmentation:
+            mious[:, i] = cls_result["miou"]
         num_gts[:, i] = cls_result["num_gts"]
 
     if dataset is None:
@@ -82,7 +86,9 @@ def print_map_summary(  # pylint: disable=too-many-locals,too-many-branches
     if not isinstance(mean_ap, list):
         mean_ap = [mean_ap]
 
-    header = ["class", "gts", "dets", "recall", "ap", "miou"]
+    header = ["class", "gts", "dets", "recall", "ap"]
+    if segmentation:
+        header.append("miou")
     for i in range(num_scales):
         if scale_ranges is not None:
             print_log(f"Scale range {scale_ranges[i]}", logger=logger)
@@ -94,12 +100,19 @@ def print_map_summary(  # pylint: disable=too-many-locals,too-many-branches
                 results[j]["num_dets"],
                 f"{recalls[i, j]:.3f}",
                 f"{aps[i, j]:.3f}",
-                f"{mious[i, j]:.3f}",
             ]
+            if segmentation:
+                row_data.append(f"{mious[i, j]:.3f}")
             table_data.append(row_data)
-        table_data.append(["mAP", "", "", "", f"{mean_ap[i]:.3f}", f"{np.mean(mious[i]):.3f}"])
+        table_ = (
+            ["mAP", "", "", "", f"{mean_ap[i]:.3f}", f"{np.mean(mious[i]):.3f}"]
+            if segmentation
+            else ["mAP", "", "", "", f"{mean_ap[i]:.3f}"]
+        )
+        table_data.append(table_)
         table = AsciiTable(table_data)
         table.inner_footing_row_border = True
+        time.sleep(0.1)  # prevent segmentation fault
         print_log("\n" + table.table, logger=logger)
 
 
@@ -245,6 +258,7 @@ def __init__(self, annotation: List[Dict], domain: Domain, classes: List[str], n
         else:
             self.annotation = annotation
         self.nproc = nproc
+        mean_ap.print_map_summary = print_map_summary
 
     def get_gt_instance_masks(self, annotation: List[Dict]):
         """Format ground truth instance mask annotation.

diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py b/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py
@@ -38,7 +38,6 @@ class CustomSingleStageDetector(SAMDetectorMixin, DetLossDynamicsTrackingMixin,
 
     def __init__(self, *args, task_adapt=None, **kwargs):
         super().__init__(*args, **kwargs)
-
         # Hook for class-sensitive weight loading
         if task_adapt:
             self._register_load_state_dict_pre_hook(

diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py
@@ -10,6 +10,7 @@
 from .custom_fcn_mask_head import CustomFCNMaskHead
 from .custom_retina_head import CustomRetinaHead
 from .custom_roi_head import CustomRoIHead
+from .custom_rpn_head import CustomRPNHead
 from .custom_ssd_head import CustomSSDHead
 from .custom_vfnet_head import CustomVFNetHead
 from .custom_yolox_head import CustomYOLOXHead
@@ -27,6 +28,7 @@
     "CustomVFNetHead",
     "CustomYOLOXHead",
     "DETRHeadExtension",
+    "CustomRPNHead",
     # Loss dynamics tracking
     "CustomATSSHeadTrackingLossDynamics",
 ]
diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_atss_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_atss_head.py
@@ -49,6 +49,32 @@ def __init__(self, *args, bg_loss_weight=-1.0, use_qfl=False, qfl_cfg=None, **kw
         self.bg_loss_weight = bg_loss_weight
         self.use_qfl = use_qfl
 
+    def forward_single(self, x, scale):
+        """Forward feature of a single scale level.
+
+        The parent implementation computes the outputs; when they sit on an
+        "hpu" device, CPU copies are returned instead.
+
+        Args:
+            x (Tensor): Features of a single scale level.
+            scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize
+                the bbox prediction.
+
+        Returns:
+            tuple:
+                cls_score (Tensor): Cls scores for a single scale level
+                    the channels number is num_anchors * num_classes.
+                bbox_pred (Tensor): Box energies / deltas for a single scale
+                    level, the channels number is num_anchors * 4.
+                centerness (Tensor): Centerness for a single scale level, the
+                    channel number is (N, num_anchors * 1, H, W).
+        """
+        cls_score, bbox_pred, centerness = super().forward_single(x, scale)
+        if cls_score.device.type != "hpu":
+            return cls_score, bbox_pred, centerness
+        # put further post-processing on cpu
+        return cls_score.cpu(), bbox_pred.cpu(), centerness.cpu()
+
     @force_fp32(apply_to=("cls_scores", "bbox_preds", "centernesses"))
     def loss(self, cls_scores, bbox_preds, centernesses, gt_bboxes, gt_labels, img_metas, gt_bboxes_ignore=None):
         """Compute losses of the head.

diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py
@@ -54,6 +54,14 @@ def _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels, img_met
         bbox_results.update(loss_bbox=loss_bbox)
         return bbox_results
 
+    def _mask_forward(self, x, rois=None, pos_inds=None, bbox_feats=None):
+        """Run the parent mask forward; on HPU, copy "mask_pred" and "mask_feats" to CPU."""
+        results = super()._mask_forward(x, rois, pos_inds, bbox_feats)
+        if results["mask_pred"].device.type == "hpu":
+            for key in ("mask_pred", "mask_feats"):
+                results[key] = results[key].cpu()
+        return results
+
 
 @HEADS.register_module()
 class CustomConvFCBBoxHead(Shared2FCBBoxHead, CrossDatasetDetectorHead):
@@ -125,6 +133,16 @@ def get_targets(self, sampling_results, gt_bboxes, gt_labels, img_metas, rcnn_tr
             valid_label_mask = torch.cat(valid_label_mask, 0)
         return labels, label_weights, bbox_targets, bbox_weights, valid_label_mask
 
+    def forward(self, x):
+        """Run the parent head and hand back its two outputs.
+
+        When the classification scores sit on an "hpu" device, both outputs are copied to CPU first.
+        """
+        cls_score, bbox_pred = super().forward(x)
+        if cls_score.device.type != "hpu":
+            return cls_score, bbox_pred
+        return cls_score.cpu(), bbox_pred.cpu()
+
     @force_fp32(apply_to=("cls_score", "bbox_pred"))
     def loss(
         self,

diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py
@@ -0,0 +1,25 @@
+"""Custom RPN head for OTX template."""
+# Copyright (C) 2022 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+from mmdet.models.builder import HEADS
+from mmdet.models.dense_heads import RPNHead
+
+
+@HEADS.register_module()
+class CustomRPNHead(RPNHead):
+    """RPN head whose per-level outputs are copied to CPU when computed on HPU.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+        num_convs (int): Number of convolution layers in the head. Default 1.
+    """
+
+    def forward_single(self, x):
+        """Forward one scale level through the parent head, copying the outputs to CPU on HPU."""
+        cls_score, bbox_pred = super().forward_single(x)
+        if cls_score.device.type != "hpu":
+            return cls_score, bbox_pred
+        # Outputs were produced on HPU: hand CPU copies to the caller.
+        return cls_score.cpu(), bbox_pred.cpu()
diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_ssd_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_ssd_head.py
@@ -81,6 +81,35 @@ def _init_layers(self):
                     nn.Conv2d(in_channel, num_base_priors * self.cls_out_channels, kernel_size=3, padding=1)
                 )
 
+    def forward(self, feats):
+        """Run the per-level classification and regression convs on the features.
+
+        Outputs computed on an "hpu" device are copied to CPU before being
+        collected.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple:
+                cls_scores (list[Tensor]): Classification scores for all scale
+                    levels, each is a 4D-tensor, the channels number is
+                    num_anchors * num_classes.
+                bbox_preds (list[Tensor]): Box energies / deltas for all scale
+                    levels, each is a 4D-tensor, the channels number is
+                    num_anchors * 4.
+        """
+        cls_scores, bbox_preds = [], []
+        for level_feat, reg_branch, cls_branch in zip(feats, self.reg_convs, self.cls_convs):
+            cls_out, reg_out = cls_branch(level_feat), reg_branch(level_feat)
+            if cls_out.device.type == "hpu":
+                # Copy this level's outputs to CPU before collecting them.
+                cls_out, reg_out = cls_out.cpu(), reg_out.cpu()
+            cls_scores.append(cls_out)
+            bbox_preds.append(reg_out)
+        return cls_scores, bbox_preds
+
     def loss_single(
         self,
         cls_score,

diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py
@@ -103,6 +103,16 @@ def loss(self, cls_scores, bbox_preds, objectnesses, gt_bboxes, gt_labels, img_m
 
         return loss_dict
 
+    def forward_single(self, x, cls_convs, reg_convs, conv_cls, conv_reg, conv_obj):
+        """Forward one scale level via the parent head, returning CPU copies when on HPU."""
+        level_outs = super().forward_single(x, cls_convs, reg_convs, conv_cls, conv_reg, conv_obj)
+        cls_score, bbox_pred, objectness = level_outs
+        if cls_score.device.type != "hpu":
+            # Nothing to move: pass the parent's outputs through.
+            return cls_score, bbox_pred, objectness
+        # put on cpu for further post-processing
+        return cls_score.cpu(), bbox_pred.cpu(), objectness.cpu()
+
 
 @HEADS.register_module()
 class CustomYOLOXHeadTrackingLossDynamics(TrackingLossDynamicsMixIn, CustomYOLOXHead):
@@ -245,6 +255,7 @@ def _get_target_single(self, cls_preds, objectness, priors, decoded_bboxes, gt_b
         num_priors = priors.size(0)
         num_gts = gt_labels.size(0)
         gt_bboxes = gt_bboxes.to(decoded_bboxes.dtype)
+
         # No target
         if num_gts == 0:
             cls_target = cls_preds.new_zeros((0, self.num_classes))

diff --git a/src/otx/algorithms/detection/configs/instance_segmentation/efficientnetb2b_maskrcnn/model.py b/src/otx/algorithms/detection/configs/instance_segmentation/efficientnetb2b_maskrcnn/model.py
@@ -28,7 +28,7 @@
     type="CustomMaskRCNN",  # Use CustomMaskRCNN for Incremental Learning
     neck=dict(type="FPN", in_channels=[24, 48, 120, 352], out_channels=80, num_outs=5),
     rpn_head=dict(
-        type="RPNHead",
+        type="CustomRPNHead",
         in_channels=80,
         feat_channels=80,
         anchor_generator=dict(type="AnchorGenerator", scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]),

diff --git a/src/otx/algorithms/detection/configs/instance_segmentation/maskrcnn_swin_t/model.py b/src/otx/algorithms/detection/configs/instance_segmentation/maskrcnn_swin_t/model.py
@@ -38,7 +38,7 @@
     ),
     neck=dict(type="FPN", in_channels=[96, 192, 384, 768], out_channels=256, num_outs=5),
     rpn_head=dict(
-        type="RPNHead",
+        type="CustomRPNHead",
         in_channels=256,
         feat_channels=256,
         anchor_generator=dict(type="AnchorGenerator", scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]),