From 5dfe5181ecd2e6e38d4b309ab0b4c4e02e5be04e Mon Sep 17 00:00:00 2001 From: kprokofi Date: Tue, 31 Oct 2023 16:04:08 +0000 Subject: [PATCH 01/16] added support for OD on habana --- .../common/adapters/mmcv/configurer.py | 17 ++++ .../common/adapters/mmcv/hooks/__init__.py | 3 + .../utils/_builder_build_data_parallel.py | 85 +++++++++++++++++++ src/otx/algorithms/common/utils/__init__.py | 1 + src/otx/algorithms/common/utils/utils.py | 1 + .../detection/adapters/mmdet/apis/train.py | 40 ++++++++- .../adapters/mmdet/evaluation/evaluator.py | 4 +- .../detectors/custom_single_stage_detector.py | 38 ++++++++- .../mmdet/models/heads/custom_ssd_head.py | 34 ++++++++ .../configs/detection/configuration.yaml | 2 +- .../detection/cspdarknet_yolox_x/model.py | 2 +- .../cspdarknet_yolox_x/template.yaml | 4 +- .../detection/mobilenetv2_ssd/model.py | 1 + .../detection/mobilenetv2_ssd/template.yaml | 2 +- .../recipes/stages/detection/incremental.py | 34 ++++---- 15 files changed, 241 insertions(+), 27 deletions(-) diff --git a/src/otx/algorithms/common/adapters/mmcv/configurer.py b/src/otx/algorithms/common/adapters/mmcv/configurer.py index 54ee9326b80..3d0e936aef2 100644 --- a/src/otx/algorithms/common/adapters/mmcv/configurer.py +++ b/src/otx/algorithms/common/adapters/mmcv/configurer.py @@ -176,7 +176,11 @@ def configure_device(self, cfg): elif "gpu_ids" not in cfg: cfg.gpu_ids = range(1) +<<<<<<< HEAD # consider "cuda", "hpu" and "cpu" device only +======= + # consider "cuda", "xpu", "hpu" and "cpu" device only +>>>>>>> added support for OD on habana if is_hpu_available(): cfg.device = "hpu" elif torch.cuda.is_available(): @@ -184,7 +188,10 @@ def configure_device(self, cfg): elif is_xpu_available(): try: import intel_extension_for_pytorch as ipex # noqa: F401 +<<<<<<< HEAD +======= +>>>>>>> added support for OD on habana cfg.device = "xpu" except ModuleNotFoundError: cfg.device = "cpu" @@ -263,6 +270,7 @@ def configure_fp16(cfg: Config): distributed = getattr(cfg, "distributed", False) opts: Dict[str, Any] = {} if fp16_config is not None: +<<<<<<< HEAD if is_hpu_available(): if optim_type == "SAMOptimizerHook": # TODO (sungchul): consider SAM optimizer @@ -270,6 +278,9 @@ def configure_fp16(cfg: Config): opts["type"] = "HPUOptimizerHook" cfg.optimizer_config.update(opts) elif torch.cuda.is_available() or is_xpu_available(): +======= + if torch.cuda.is_available() or is_xpu_available(): +>>>>>>> added support for OD on habana opts.update({"distributed": distributed, **fp16_config}) if optim_type == "SAMOptimizerHook": opts["type"] = "Fp16SAMOptimizerHook" @@ -281,6 +292,12 @@ def configure_fp16(cfg: Config): cfg.fp16 = fp16_config opts = dict() cfg.optimizer_config.update(opts) + elif is_hpu_available(): + if optim_type == "SAMOptimizerHook": + # TODO (sungchul): consider SAM optimizer + logger.warning("SAMOptimizerHook is not supported on HPU. 
Changed to OptimizerHook.") + opts["type"] = "HPUOptimizerHook" + cfg.optimizer_config.update(opts) else: logger.info("Revert FP16 to FP32 on CPU device") diff --git a/src/otx/algorithms/common/adapters/mmcv/hooks/__init__.py b/src/otx/algorithms/common/adapters/mmcv/hooks/__init__.py index a7c41d80fee..a6dd4ea965c 100644 --- a/src/otx/algorithms/common/adapters/mmcv/hooks/__init__.py +++ b/src/otx/algorithms/common/adapters/mmcv/hooks/__init__.py @@ -52,6 +52,7 @@ from .semisl_cls_hook import SemiSLClsHook from .task_adapt_hook import TaskAdaptHook from .two_crop_transform_hook import TwoCropTransformHook +from .hpu_optimizer_hook import HPUOptimizerHook, HPUDistOptimizerHook __all__ = [ "AdaptiveRepeatDataHook", @@ -90,6 +91,8 @@ "MeanTeacherHook", "MemCacheHook", "LossDynamicsTrackingHook", + "HPUOptimizerHook", + "HPUDistOptimizerHook", ] try: diff --git a/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py b/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py index 39c9bf5f7b3..206316db800 100644 --- a/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py +++ b/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py @@ -12,7 +12,13 @@ from mmcv import Config from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +<<<<<<< HEAD from otx.algorithms.common.utils import is_hpu_available, is_xpu_available +======= +from otx.algorithms.common.utils import is_xpu_available, is_hpu_available +import habana_frameworks.torch as htorch +from torch._utils import _get_device_index +>>>>>>> added support for OD on habana @overload @@ -138,12 +144,63 @@ def val_step(self, *inputs, **kwargs): with torch.autocast(device_type="xpu", dtype=torch.bfloat16, enabled=self.enable_autocast): return super().val_step(*inputs, **kwargs) +<<<<<<< HEAD class HPUDataParallel(MMDataParallel): def __init__(self, *args, enable_autocast: bool = False, **kwargs): super().__init__(*args, **kwargs) self.enable_autocast = enable_autocast self.src_device_obj = torch.device("hpu", self.device_ids[0]) +======= +def _get_available_device_type(): + if torch.cuda.is_available(): + return "cuda" + if hasattr(torch, "xpu") and torch.xpu.is_available(): # type: ignore[attr-defined] + return "xpu" + if is_hpu_available(): + return "hpu" + # add more available device types here + return None + + +def _get_device_attr(get_member): + device_type = _get_available_device_type() + if device_type and device_type.lower() == "cuda": + return get_member(torch.cuda) + if device_type and device_type.lower() == "xpu": + return get_member(torch.xpu) # type: ignore[attr-defined] + if device_type and device_type.lower() == "hpu": + return get_member(htorch.hpu) + # add more available device types here + return None + + +def _get_all_device_indices(): + # all device index + return _get_device_attr(lambda m: list(range(m.device_count()))) + + +class HPUDataParallel(MMDataParallel): + def __init__(self, module, device_ids=None, output_device=None, dim=0, is_autocast=True): + super().__init__(module=module) + device_type = _get_available_device_type() + if device_type is None: + self.module = module + self.device_ids = [] + return + + if device_ids is None: + device_ids = _get_all_device_indices() + + if output_device is None: + output_device = device_ids[0] + + self.dim = dim + self.device_ids = [_get_device_index(x, True) for x in device_ids] + self.output_device = _get_device_index(output_device, True) + self.src_device_obj = 
torch.device(device_type, self.device_ids[0]) + self.is_autocast = is_autocast +>>>>>>> added support for OD on habana def scatter(self, inputs, kwargs, device_ids): inputs, kwargs = super().scatter(inputs, kwargs, [-1]) @@ -154,25 +211,42 @@ def scatter(self, inputs, kwargs, device_ids): if isinstance(val, dict): for k in val: if isinstance(val[k], torch.Tensor): +<<<<<<< HEAD val[k] = val[k].to(self.src_device_obj) elif isinstance(val[k], list): for i, item in enumerate(val[k]): if isinstance(item, torch.Tensor): val[k][i] = item.to(self.src_device_obj) +======= + val[k] = val[k].to(torch.device(f"hpu:{device_ids[0]}")) + elif isinstance(val[k], list): + for i, item in enumerate(val[k]): + if isinstance(item, torch.Tensor): + val[k][i] = item.to(torch.device(f"hpu:{device_ids[0]}")) +>>>>>>> added support for OD on habana for x in kwargs: if isinstance(x, dict): for k in x: if isinstance(x[k], torch.Tensor): +<<<<<<< HEAD x[k] = x[k].to(f"hpu:{device_ids[0]}") elif isinstance(x[k], list): for i, item in enumerate(x[k]): if isinstance(item, torch.Tensor): x[k][i] = item.to(self.src_device_obj) +======= + x[k] = x[k].to("hpu") + elif isinstance(x[k], list): + for i, item in enumerate(x[k]): + if isinstance(item, torch.Tensor): + x[k][i] = item.to(torch.device(f"hpu:{device_ids[0]}")) +>>>>>>> added support for OD on habana return inputs, kwargs def forward(self, *inputs, **kwargs): +<<<<<<< HEAD with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=self.enable_autocast): return super().forward(*inputs, **kwargs) @@ -182,4 +256,15 @@ def train_step(self, *inputs, **kwargs): def val_step(self, *inputs, **kwargs): with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=self.enable_autocast): +======= + with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.is_autocast): + return super().forward(*inputs, **kwargs) + + def train_step(self, *inputs, **kwargs): + with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.is_autocast): + return super().train_step(*inputs, **kwargs) + + def val_step(self, *inputs, **kwargs): + with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.is_autocast): +>>>>>>> added support for OD on habana return super().val_step(*inputs, **kwargs) diff --git a/src/otx/algorithms/common/utils/__init__.py b/src/otx/algorithms/common/utils/__init__.py index 6395bd6e60d..23bb01ed20f 100644 --- a/src/otx/algorithms/common/utils/__init__.py +++ b/src/otx/algorithms/common/utils/__init__.py @@ -30,6 +30,7 @@ get_task_class, is_hpu_available, is_xpu_available, + is_hpu_available, load_template, read_py_config, set_random_seed, diff --git a/src/otx/algorithms/common/utils/utils.py b/src/otx/algorithms/common/utils/utils.py index 92e1b2f0853..c9b651244a2 100644 --- a/src/otx/algorithms/common/utils/utils.py +++ b/src/otx/algorithms/common/utils/utils.py @@ -17,6 +17,7 @@ import torch import yaml from addict import Dict as adict +import habana_frameworks.torch as htorch HPU_AVAILABLE = None try: diff --git a/src/otx/algorithms/detection/adapters/mmdet/apis/train.py b/src/otx/algorithms/detection/adapters/mmdet/apis/train.py index caf8720b59a..d3f49219a17 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/apis/train.py +++ b/src/otx/algorithms/detection/adapters/mmdet/apis/train.py @@ -23,11 +23,15 @@ from mmdet.utils.util_distribution import build_dp, dp_factory from torchvision.ops import nms as tv_nms from torchvision.ops import roi_align as tv_roi_align +from torch.profiler import profile, record_function, 
ProfilerActivity -from otx.algorithms.common.adapters.mmcv.utils import XPUDataParallel +from habana_frameworks.torch.utils.library_loader import load_habana_module +from otx.algorithms.common.adapters.mmcv.utils import XPUDataParallel, HPUDataParallel ext_module = ext_loader.load_ext("_ext", ["nms", "softnms", "nms_match", "nms_rotated", "nms_quadri"]) dp_factory["xpu"] = XPUDataParallel +dp_factory["hpu"] = HPUDataParallel +load_habana_module() def auto_scale_lr(cfg, distributed, logger): @@ -119,6 +123,15 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times elif cfg.device == "xpu": model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids, enable_autocast=bool(fp16_cfg)) model.to(f"xpu:{cfg.gpu_ids[0]}") + elif cfg.device == "hpu": + import habana_frameworks.torch.core as htcore + os.environ["PT_HPU_LAZY_MODE"] = "1" + assert len(cfg.gpu_ids) == 1 + # CHECK IT + model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids, dim=0, is_autocast=bool(fp16_cfg)) + # model = HPUDataParallel(model, dim=0, device_ids=cfg.gpu_ids, is_autocast=bool(fp16_cfg)) + model.to(f"hpu:{cfg.gpu_ids[0]}") + htcore.mark_step() else: model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids) @@ -137,6 +150,30 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times model.train() model, optimizer = torch.xpu.optimize(model, optimizer=optimizer, dtype=dtype) + if cfg.device == "hpu": + NMSop.forward = monkey_patched_xpu_nms + RoIAlign.forward = monkey_patched_xpu_roi_align + from otx.algorithms.common.adapters.mmcv.optimizer.hpu_optimizer import register_habana_optimizers + habana_optimizers = register_habana_optimizers() + if (new_type := "Fused" + cfg.optimizer.get("type", "SGD")) in habana_optimizers: + cfg.optimizer["type"] = new_type + # activities = [torch.profiler.ProfilerActivity.CPU] + # activities.append(torch.profiler.ProfilerActivity.HPU) + # for epoch in range(10): + # for det_out in data_loaders[0]: + # img = det_out["img"].data[-1].to(torch.device("hpu")) + # img_metas = det_out["img_metas"].data[-1] + # gt_bboxes = [bbox.to(torch.device("hpu")) for bbox in det_out["gt_bboxes"].data[-1]] + # gt_labels = [label.to(torch.device("hpu")) for label in det_out["gt_labels"].data[-1]] + # with torch.profiler.profile( + # # schedule=torch.profiler.schedule(wait=0, warmup=20, active=5, repeat=1), + # activities=activities, + # on_trace_ready=torch.profiler.tensorboard_trace_handler('logs')) as profiler: + # model.module.forward_train(img, img_metas, gt_bboxes, gt_labels) + # print(profiler.key_averages().table()) + # # print(losses) + # breakpoint() + runner = build_runner( cfg.runner, default_args=dict(model=model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) ) @@ -220,7 +257,6 @@ def monkey_patched_xpu_nms(ctx, bboxes, scores, iou_threshold, offset, score_thr inds = ext_module.nms(bboxes, scores, iou_threshold=float(iou_threshold), offset=offset) bboxes = bboxes.to(device) scores = scores.to(device) - if max_num > 0: inds = inds[:max_num] if is_filtering_by_score: diff --git a/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py b/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py index b6e5e6ab2dd..4fbf6a9da7f 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py +++ b/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py @@ -60,7 +60,6 @@ def print_map_summary( # pylint: disable=too-many-locals,too-many-branches assert len(scale_ranges) == 
num_scales num_classes = len(results) - recalls = np.zeros((num_scales, num_classes), dtype=np.float32) aps = np.zeros((num_scales, num_classes), dtype=np.float32) num_gts = np.zeros((num_scales, num_classes), dtype=int) @@ -376,7 +375,7 @@ def evaluate(self, results, logger, iou_thr, scale_ranges): metric: mAP and mIoU metric """ if self.domain == Domain.DETECTION: - return eval_map( + output = eval_map( results, self.annotation, scale_ranges=scale_ranges, @@ -384,4 +383,5 @@ def evaluate(self, results, logger, iou_thr, scale_ranges): dataset=self.classes, logger=logger, ) + return output return self.evaluate_mask(results, logger, iou_thr) diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py b/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py index f690c38d86b..702da56c428 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py @@ -6,8 +6,10 @@ import functools import torch +import time from mmdet.models.builder import DETECTORS from mmdet.models.detectors.single_stage import SingleStageDetector +from mmdet.core import bbox2result from otx.algorithms.common.adapters.mmcv.hooks.recording_forward_hook import ( FeatureVectorHook, @@ -34,7 +36,7 @@ class CustomSingleStageDetector(SAMDetectorMixin, DetLossDynamicsTrackingMixin, L2SPDetectorMixin, SingleStageDetector): """SAM optimizer & L2SP regularizer enabled custom SSD.""" - TRACKING_LOSS_TYPE = (TrackingLossType.cls, TrackingLossType.bbox) + # TRACKING_LOSS_TYPE = (TrackingLossType.cls, TrackingLossType.bbox) def __init__(self, *args, task_adapt=None, **kwargs): super().__init__(*args, **kwargs) @@ -74,10 +76,44 @@ def forward_train(self, img, img_metas, gt_bboxes, gt_labels, gt_bboxes_ignore=N batch_input_shape = tuple(img[0].size()[-2:]) for img_meta in img_metas: img_meta["batch_input_shape"] = batch_input_shape + ttt = time.time() x = self.extract_feat(img) + print("extract_feat", time.time() - ttt) losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes, gt_labels, gt_bboxes_ignore, **kwargs) return losses + def simple_test(self, img, img_metas, rescale=False): + """Test function without test-time augmentation. + + Args: + img (torch.Tensor): Images with shape (N, C, H, W). + img_metas (list[dict]): List of image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[list[np.ndarray]]: BBox results of each image and classes. + The outer list corresponds to each image. The inner list + corresponds to each class. 
+ """ + feat = self.extract_feat(img) + results_list = self.bbox_head.simple_test( + feat, img_metas, rescale=rescale) + + # bbox_results = [] + # for det_bboxes, det_labels in results_list: + # if det_bboxes.dtype == torch.bfloat16: + # det_bboxes = det_bboxes.to(torch.float32) + # det_labels = det_labels.to(torch.float32) + # bbox_results.append(bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes)) + # else: + # bbox_results.append(bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes)) + bbox_results = [ + bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes) + for det_bboxes, det_labels in results_list + ] + return bbox_results + @staticmethod def load_state_dict_pre_hook(model, model_classes, chkpt_classes, chkpt_dict, prefix, *args, **kwargs): """Modify input state_dict according to class name matching before weight loading.""" diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_ssd_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_ssd_head.py index 6d5f1ce8427..42f37571457 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_ssd_head.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_ssd_head.py @@ -13,6 +13,7 @@ from mmdet.models.dense_heads.ssd_head import SSDHead from mmdet.models.losses import smooth_l1_loss from torch import nn +import time from otx.algorithms.detection.adapters.mmdet.models.heads.cross_dataset_detector_head import TrackingLossDynamicsMixIn from otx.algorithms.detection.adapters.mmdet.models.loss_dyns import ( @@ -81,6 +82,32 @@ def _init_layers(self): nn.Conv2d(in_channel, num_base_priors * self.cls_out_channels, kernel_size=3, padding=1) ) + def forward(self, feats): + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: + cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_anchors * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_anchors * 4. + """ + cls_scores = [] + bbox_preds = [] + start = time.time() + for feat, reg_conv, cls_conv in zip(feats, self.reg_convs, + self.cls_convs): + cls_scores.append(cls_conv(feat)) + bbox_preds.append(reg_conv(feat)) + print("bbox_head_forward: ", time.time() - start) + return cls_scores, bbox_preds + def loss_single( self, cls_score, @@ -118,6 +145,7 @@ def loss_single( """ # Re-weigting BG loss + start1 = time.time() label_weights = label_weights.reshape(-1) if self.bg_loss_weight >= 0.0: neg_indices = labels == self.num_classes @@ -125,6 +153,7 @@ def loss_single( label_weights[neg_indices] = self.bg_loss_weight loss_cls_all = self.loss_cls(cls_score, labels, label_weights) + print("loss_cls_all: ", time.time() - start1) if len(loss_cls_all.shape) > 1: loss_cls_all = loss_cls_all.sum(-1) # FG cat_id: [0, num_classes -1], BG cat_id: num_classes @@ -146,7 +175,10 @@ def loss_single( # TODO: We need to verify that this is working properly. 
# pylint: disable=redundant-keyword-arg + start = time.time() loss_bbox = self._get_loss_bbox(bbox_pred, bbox_targets, bbox_weights, num_total_samples) + print("loss_bbox: ", time.time() - start) + print("loss_single: ", time.time() - start1) return loss_cls[None], loss_bbox def _get_pos_inds(self, labels): @@ -172,7 +204,9 @@ def _get_loss_cls(self, num_total_samples, loss_cls_all, pos_inds, topk_loss_cls def loss(self, cls_scores, bbox_preds, gt_bboxes, gt_labels, img_metas, gt_bboxes_ignore=None): """Loss function.""" + start = time.time() losses = super().loss(cls_scores, bbox_preds, gt_bboxes, gt_labels, img_metas, gt_bboxes_ignore) + print("loss_ALL: ", time.time() - start) losses_cls = losses["loss_cls"] losses_bbox = losses["loss_bbox"] diff --git a/src/otx/algorithms/detection/configs/detection/configuration.yaml b/src/otx/algorithms/detection/configs/detection/configuration.yaml index d36b0d941bc..7fb866e16df 100644 --- a/src/otx/algorithms/detection/configs/detection/configuration.yaml +++ b/src/otx/algorithms/detection/configs/detection/configuration.yaml @@ -129,7 +129,7 @@ learning_parameters: warning: null enable_early_stopping: affects_outcome_of: TRAINING - default_value: true + default_value: false description: Early exit from training when validation accuracy isn't changed or decreased for several epochs. editable: true header: Enable early stopping of the training diff --git a/src/otx/algorithms/detection/configs/detection/cspdarknet_yolox_x/model.py b/src/otx/algorithms/detection/configs/detection/cspdarknet_yolox_x/model.py index 857021810d1..e734996e4b1 100644 --- a/src/otx/algorithms/detection/configs/detection/cspdarknet_yolox_x/model.py +++ b/src/otx/algorithms/detection/configs/detection/cspdarknet_yolox_x/model.py @@ -20,5 +20,5 @@ load_from = "https://download.openmmlab.com/mmdetection/v2.0/yolox\ /yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth" -fp16 = dict(loss_scale=512.0) +fp16 = None ignore = False diff --git a/src/otx/algorithms/detection/configs/detection/cspdarknet_yolox_x/template.yaml b/src/otx/algorithms/detection/configs/detection/cspdarknet_yolox_x/template.yaml index 50e07835a96..f59b60c12b1 100644 --- a/src/otx/algorithms/detection/configs/detection/cspdarknet_yolox_x/template.yaml +++ b/src/otx/algorithms/detection/configs/detection/cspdarknet_yolox_x/template.yaml @@ -26,10 +26,10 @@ hyper_parameters: parameter_overrides: learning_parameters: batch_size: - default_value: 4 + default_value: 16 auto_hpo_state: POSSIBLE inference_batch_size: - default_value: 4 + default_value: 16 learning_rate: default_value: 0.001 auto_hpo_state: POSSIBLE diff --git a/src/otx/algorithms/detection/configs/detection/mobilenetv2_ssd/model.py b/src/otx/algorithms/detection/configs/detection/mobilenetv2_ssd/model.py index 45847b0b80c..bc4eff90aa2 100644 --- a/src/otx/algorithms/detection/configs/detection/mobilenetv2_ssd/model.py +++ b/src/otx/algorithms/detection/configs/detection/mobilenetv2_ssd/model.py @@ -96,4 +96,5 @@ /models/object_detection/v2/mobilenet_v2-2s_ssd-992x736.pth" fp16 = dict(loss_scale=512.0) +# fp16 = None ignore = False diff --git a/src/otx/algorithms/detection/configs/detection/mobilenetv2_ssd/template.yaml b/src/otx/algorithms/detection/configs/detection/mobilenetv2_ssd/template.yaml index 7b517542b35..3e8768caad4 100644 --- a/src/otx/algorithms/detection/configs/detection/mobilenetv2_ssd/template.yaml +++ b/src/otx/algorithms/detection/configs/detection/mobilenetv2_ssd/template.yaml @@ -36,7 +36,7 @@ 
hyper_parameters: learning_rate_warmup_iters: default_value: 3 num_iters: - default_value: 200 + default_value: 50 nncf_optimization: enable_quantization: default_value: true diff --git a/src/otx/recipes/stages/detection/incremental.py b/src/otx/recipes/stages/detection/incremental.py index 9ddd2e28e55..a78cea85e93 100644 --- a/src/otx/recipes/stages/detection/incremental.py +++ b/src/otx/recipes/stages/detection/incremental.py @@ -21,28 +21,28 @@ interval=1, priority=75, ), - dict( - type="EMAHook", - priority="ABOVE_NORMAL", - momentum=0.1, - ), + # dict( + # type="EMAHook", + # priority="ABOVE_NORMAL", + # momentum=0.1, + # ), ] -lr_config = dict( - policy="ReduceLROnPlateau", - metric="mAP", - patience=5, - iteration_patience=0, - interval=1, - min_lr=1e-06, - warmup="linear", - warmup_iters=200, - warmup_ratio=0.3333333333333333, -) +# lr_config = dict( +# policy="ReduceLROnPlateau", +# metric="mAP", +# patience=5, +# iteration_patience=0, +# interval=1, +# min_lr=1e-06, +# warmup="linear", +# warmup_iters=200, +# warmup_ratio=0.3333333333333333, +# ) ignore = True adaptive_validation_interval = dict( max_interval=5, - enable_adaptive_interval_hook=True, + enable_adaptive_interval_hook=False, enable_eval_before_run=True, ) From 1f04dc0b56a883f4ae00156d849fe3f17b6f3fa6 Mon Sep 17 00:00:00 2001 From: kprokofi Date: Tue, 31 Oct 2023 16:05:36 +0000 Subject: [PATCH 02/16] added hpu_opt --- .../common/adapters/mmcv/hooks/hpu_optimizer_hook.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/otx/algorithms/common/adapters/mmcv/hooks/hpu_optimizer_hook.py b/src/otx/algorithms/common/adapters/mmcv/hooks/hpu_optimizer_hook.py index f5e26c49083..13eef26de2d 100644 --- a/src/otx/algorithms/common/adapters/mmcv/hooks/hpu_optimizer_hook.py +++ b/src/otx/algorithms/common/adapters/mmcv/hooks/hpu_optimizer_hook.py @@ -4,8 +4,15 @@ # SPDX-License-Identifier: Apache-2.0 # +<<<<<<< HEAD import habana_frameworks.torch.core as htcore from mmcv.runner import HOOKS, OptimizerHook +======= +from mmcv.runner import HOOKS, OptimizerHook +from mmcls.core import DistOptimizerHook +import time +import habana_frameworks.torch.core as htcore +>>>>>>> added hpu_opt @HOOKS.register_module() From b61cea7f1a98998d418c3859db9636a3fee89e16 Mon Sep 17 00:00:00 2001 From: kprokofi Date: Mon, 6 Nov 2023 14:57:13 +0000 Subject: [PATCH 03/16] added OD support. 
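
For context, a rough sketch of how the HPU path wired up in these patches is expected
to be driven end to end: device registration, lazy mode, data-parallel wrapping and the
fused-optimizer lookup. The names build_dp, dp_factory, HPUDataParallel,
register_habana_optimizers and the PT_HPU_LAZY_MODE flag come from the diffs in this
series; the wrapper function itself is illustrative only and not part of the patches:

    import os

    from mmdet.utils.util_distribution import build_dp, dp_factory

    from otx.algorithms.common.adapters.mmcv.optimizer.hpu_optimizer import (
        register_habana_optimizers,
    )
    from otx.algorithms.common.adapters.mmcv.utils import HPUDataParallel


    def wrap_detector_for_hpu(model, cfg, fp16_cfg=None):
        """Single-card HPU setup (sketch; assumes cfg.gpu_ids == [0])."""
        import habana_frameworks.torch.core as htcore

        os.environ["PT_HPU_LAZY_MODE"] = "1"   # lazy execution mode used throughout this series
        dp_factory["hpu"] = HPUDataParallel    # make build_dp aware of the HPU wrapper
        model = build_dp(model, "hpu", device_ids=cfg.gpu_ids, dim=0,
                         is_autocast=bool(fp16_cfg))
        model.to(f"hpu:{cfg.gpu_ids[0]}")
        htcore.mark_step()                     # flush the lazily accumulated graph

        # prefer the Habana fused optimizer when one is registered for this type
        habana_optimizers = register_habana_optimizers()
        fused_type = "Fused" + cfg.optimizer.get("type", "SGD")
        if fused_type in habana_optimizers:
            cfg.optimizer["type"] = fused_type
        return model

Keeping the setup behind a single helper mirrors the cfg.device == "hpu" branch that
train_detector gains above; on non-HPU builds the branch is simply never taken.
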
--- .../detection/adapters/mmdet/apis/train.py | 20 +------------------ .../mmdet/models/heads/custom_yolox_head.py | 13 +++++++++++- .../cspdarknet_yolox_x/template.yaml | 2 +- 3 files changed, 14 insertions(+), 21 deletions(-) diff --git a/src/otx/algorithms/detection/adapters/mmdet/apis/train.py b/src/otx/algorithms/detection/adapters/mmdet/apis/train.py index d3f49219a17..d1767756d99 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/apis/train.py +++ b/src/otx/algorithms/detection/adapters/mmdet/apis/train.py @@ -23,7 +23,6 @@ from mmdet.utils.util_distribution import build_dp, dp_factory from torchvision.ops import nms as tv_nms from torchvision.ops import roi_align as tv_roi_align -from torch.profiler import profile, record_function, ProfilerActivity from habana_frameworks.torch.utils.library_loader import load_habana_module from otx.algorithms.common.adapters.mmcv.utils import XPUDataParallel, HPUDataParallel @@ -127,11 +126,10 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times import habana_frameworks.torch.core as htcore os.environ["PT_HPU_LAZY_MODE"] = "1" assert len(cfg.gpu_ids) == 1 - # CHECK IT model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids, dim=0, is_autocast=bool(fp16_cfg)) - # model = HPUDataParallel(model, dim=0, device_ids=cfg.gpu_ids, is_autocast=bool(fp16_cfg)) model.to(f"hpu:{cfg.gpu_ids[0]}") htcore.mark_step() + model.zero_grad() else: model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids) @@ -157,22 +155,6 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times habana_optimizers = register_habana_optimizers() if (new_type := "Fused" + cfg.optimizer.get("type", "SGD")) in habana_optimizers: cfg.optimizer["type"] = new_type - # activities = [torch.profiler.ProfilerActivity.CPU] - # activities.append(torch.profiler.ProfilerActivity.HPU) - # for epoch in range(10): - # for det_out in data_loaders[0]: - # img = det_out["img"].data[-1].to(torch.device("hpu")) - # img_metas = det_out["img_metas"].data[-1] - # gt_bboxes = [bbox.to(torch.device("hpu")) for bbox in det_out["gt_bboxes"].data[-1]] - # gt_labels = [label.to(torch.device("hpu")) for label in det_out["gt_labels"].data[-1]] - # with torch.profiler.profile( - # # schedule=torch.profiler.schedule(wait=0, warmup=20, active=5, repeat=1), - # activities=activities, - # on_trace_ready=torch.profiler.tensorboard_trace_handler('logs')) as profiler: - # model.module.forward_train(img, img_metas, gt_bboxes, gt_labels) - # print(profiler.key_averages().table()) - # # print(losses) - # breakpoint() runner = build_runner( cfg.runner, default_args=dict(model=model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py index 5de9fc272ff..8e04b37d16a 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py @@ -43,6 +43,7 @@ def loss(self, cls_scores, bbox_preds, objectnesses, gt_bboxes, gt_labels, img_m gt_bboxes_ignore (None | list[Tensor]): specify which bounding boxes can be ignored when computing the loss. 
""" + num_imgs = len(img_metas) featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] mlvl_priors = self.prior_generator.grid_priors( @@ -151,6 +152,14 @@ def loss(self, cls_scores, bbox_preds, objectnesses, gt_bboxes, gt_labels, img_m flatten_priors = torch.cat(mlvl_priors) flatten_bboxes = self._bbox_decode(flatten_priors, flatten_bbox_preds) + if "hpu" in flatten_cls_preds.device: + # put loss computastion on CPU -> faster, avoid errors + flatten_cls_preds = flatten_cls_preds.cpu() + flatten_bbox_preds = flatten_bbox_preds.cpu() + flatten_objectness = flatten_objectness.cpu() + flatten_priors = flatten_priors.cpu() + flatten_bboxes = flatten_bboxes.cpu() + # Init variables for loss dynamics tracking self.cur_batch_idx = 0 self.max_gt_bboxes_len = max([len(gt_bbox) for gt_bbox in gt_bboxes]) @@ -219,7 +228,6 @@ def loss(self, cls_scores, bbox_preds, objectnesses, gt_bboxes, gt_labels, img_m if self.use_l1: loss_l1 = self.loss_l1(flatten_bbox_preds.view(-1, 4)[pos_masks], l1_targets) / num_total_samples loss_dict.update(loss_l1=loss_l1) - return loss_dict @torch.no_grad() @@ -245,6 +253,9 @@ def _get_target_single(self, cls_preds, objectness, priors, decoded_bboxes, gt_b num_priors = priors.size(0) num_gts = gt_labels.size(0) gt_bboxes = gt_bboxes.to(decoded_bboxes.dtype) + if "hpu" in gt_bboxes.device: + gt_bboxes = gt_bboxes.cpu() + gt_labels = gt_labels.cpu() # No target if num_gts == 0: cls_target = cls_preds.new_zeros((0, self.num_classes)) diff --git a/src/otx/algorithms/detection/configs/detection/cspdarknet_yolox_x/template.yaml b/src/otx/algorithms/detection/configs/detection/cspdarknet_yolox_x/template.yaml index f59b60c12b1..ca1d67ef754 100644 --- a/src/otx/algorithms/detection/configs/detection/cspdarknet_yolox_x/template.yaml +++ b/src/otx/algorithms/detection/configs/detection/cspdarknet_yolox_x/template.yaml @@ -36,7 +36,7 @@ hyper_parameters: learning_rate_warmup_iters: default_value: 3 num_iters: - default_value: 200 + default_value: 20 nncf_optimization: enable_quantization: default_value: true From bcc2408753f18f44f050fd496f2a248a2a8dcfd9 Mon Sep 17 00:00:00 2001 From: kprokofi Date: Tue, 7 Nov 2023 12:42:03 +0000 Subject: [PATCH 04/16] optimize a bit YOLOX. Now, inference is fast. 
Training still freezes --- .../adapters/mmcv/hooks/hpu_optimizer_hook.py | 7 -- .../utils/_builder_build_data_parallel.py | 85 ------------------- .../detection/adapters/mmdet/apis/train.py | 5 +- .../adapters/mmdet/evaluation/evaluator.py | 21 +++-- .../mmdet/models/heads/custom_yolox_head.py | 27 ++++-- 5 files changed, 38 insertions(+), 107 deletions(-) diff --git a/src/otx/algorithms/common/adapters/mmcv/hooks/hpu_optimizer_hook.py b/src/otx/algorithms/common/adapters/mmcv/hooks/hpu_optimizer_hook.py index 13eef26de2d..f5e26c49083 100644 --- a/src/otx/algorithms/common/adapters/mmcv/hooks/hpu_optimizer_hook.py +++ b/src/otx/algorithms/common/adapters/mmcv/hooks/hpu_optimizer_hook.py @@ -4,15 +4,8 @@ # SPDX-License-Identifier: Apache-2.0 # -<<<<<<< HEAD import habana_frameworks.torch.core as htcore from mmcv.runner import HOOKS, OptimizerHook -======= -from mmcv.runner import HOOKS, OptimizerHook -from mmcls.core import DistOptimizerHook -import time -import habana_frameworks.torch.core as htcore ->>>>>>> added hpu_opt @HOOKS.register_module() diff --git a/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py b/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py index 206316db800..39c9bf5f7b3 100644 --- a/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py +++ b/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py @@ -12,13 +12,7 @@ from mmcv import Config from mmcv.parallel import MMDataParallel, MMDistributedDataParallel -<<<<<<< HEAD from otx.algorithms.common.utils import is_hpu_available, is_xpu_available -======= -from otx.algorithms.common.utils import is_xpu_available, is_hpu_available -import habana_frameworks.torch as htorch -from torch._utils import _get_device_index ->>>>>>> added support for OD on habana @overload @@ -144,63 +138,12 @@ def val_step(self, *inputs, **kwargs): with torch.autocast(device_type="xpu", dtype=torch.bfloat16, enabled=self.enable_autocast): return super().val_step(*inputs, **kwargs) -<<<<<<< HEAD class HPUDataParallel(MMDataParallel): def __init__(self, *args, enable_autocast: bool = False, **kwargs): super().__init__(*args, **kwargs) self.enable_autocast = enable_autocast self.src_device_obj = torch.device("hpu", self.device_ids[0]) -======= -def _get_available_device_type(): - if torch.cuda.is_available(): - return "cuda" - if hasattr(torch, "xpu") and torch.xpu.is_available(): # type: ignore[attr-defined] - return "xpu" - if is_hpu_available(): - return "hpu" - # add more available device types here - return None - - -def _get_device_attr(get_member): - device_type = _get_available_device_type() - if device_type and device_type.lower() == "cuda": - return get_member(torch.cuda) - if device_type and device_type.lower() == "xpu": - return get_member(torch.xpu) # type: ignore[attr-defined] - if device_type and device_type.lower() == "hpu": - return get_member(htorch.hpu) - # add more available device types here - return None - - -def _get_all_device_indices(): - # all device index - return _get_device_attr(lambda m: list(range(m.device_count()))) - - -class HPUDataParallel(MMDataParallel): - def __init__(self, module, device_ids=None, output_device=None, dim=0, is_autocast=True): - super().__init__(module=module) - device_type = _get_available_device_type() - if device_type is None: - self.module = module - self.device_ids = [] - return - - if device_ids is None: - device_ids = _get_all_device_indices() - - if output_device is None: - output_device = 
device_ids[0] - - self.dim = dim - self.device_ids = [_get_device_index(x, True) for x in device_ids] - self.output_device = _get_device_index(output_device, True) - self.src_device_obj = torch.device(device_type, self.device_ids[0]) - self.is_autocast = is_autocast ->>>>>>> added support for OD on habana def scatter(self, inputs, kwargs, device_ids): inputs, kwargs = super().scatter(inputs, kwargs, [-1]) @@ -211,42 +154,25 @@ def scatter(self, inputs, kwargs, device_ids): if isinstance(val, dict): for k in val: if isinstance(val[k], torch.Tensor): -<<<<<<< HEAD val[k] = val[k].to(self.src_device_obj) elif isinstance(val[k], list): for i, item in enumerate(val[k]): if isinstance(item, torch.Tensor): val[k][i] = item.to(self.src_device_obj) -======= - val[k] = val[k].to(torch.device(f"hpu:{device_ids[0]}")) - elif isinstance(val[k], list): - for i, item in enumerate(val[k]): - if isinstance(item, torch.Tensor): - val[k][i] = item.to(torch.device(f"hpu:{device_ids[0]}")) ->>>>>>> added support for OD on habana for x in kwargs: if isinstance(x, dict): for k in x: if isinstance(x[k], torch.Tensor): -<<<<<<< HEAD x[k] = x[k].to(f"hpu:{device_ids[0]}") elif isinstance(x[k], list): for i, item in enumerate(x[k]): if isinstance(item, torch.Tensor): x[k][i] = item.to(self.src_device_obj) -======= - x[k] = x[k].to("hpu") - elif isinstance(x[k], list): - for i, item in enumerate(x[k]): - if isinstance(item, torch.Tensor): - x[k][i] = item.to(torch.device(f"hpu:{device_ids[0]}")) ->>>>>>> added support for OD on habana return inputs, kwargs def forward(self, *inputs, **kwargs): -<<<<<<< HEAD with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=self.enable_autocast): return super().forward(*inputs, **kwargs) @@ -256,15 +182,4 @@ def train_step(self, *inputs, **kwargs): def val_step(self, *inputs, **kwargs): with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=self.enable_autocast): -======= - with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.is_autocast): - return super().forward(*inputs, **kwargs) - - def train_step(self, *inputs, **kwargs): - with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.is_autocast): - return super().train_step(*inputs, **kwargs) - - def val_step(self, *inputs, **kwargs): - with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.is_autocast): ->>>>>>> added support for OD on habana return super().val_step(*inputs, **kwargs) diff --git a/src/otx/algorithms/detection/adapters/mmdet/apis/train.py b/src/otx/algorithms/detection/adapters/mmdet/apis/train.py index d1767756d99..819e51f8f6b 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/apis/train.py +++ b/src/otx/algorithms/detection/adapters/mmdet/apis/train.py @@ -126,8 +126,9 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times import habana_frameworks.torch.core as htcore os.environ["PT_HPU_LAZY_MODE"] = "1" assert len(cfg.gpu_ids) == 1 - model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids, dim=0, is_autocast=bool(fp16_cfg)) - model.to(f"hpu:{cfg.gpu_ids[0]}") + model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids, dim=0, + is_autocast=bool(fp16_cfg), put_gt_on_device=False) + model.to(f"hpu:{cfg.gpu_ids[0]}", non_blocking=True) htcore.mark_step() model.zero_grad() else: diff --git a/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py b/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py index 4fbf6a9da7f..553b9bc7543 100644 --- 
a/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py +++ b/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py @@ -16,6 +16,7 @@ import multiprocessing as mp from typing import Dict, List, Tuple, Union +import time import mmcv import numpy as np @@ -25,12 +26,14 @@ from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps from mmdet.core.evaluation.class_names import get_classes from mmdet.core.evaluation.mean_ap import average_precision +from mmdet.core.evaluation import mean_ap from terminaltables import AsciiTable from otx.api.entities.label import Domain from otx.api.utils.time_utils import timeit + def print_map_summary( # pylint: disable=too-many-locals,too-many-branches mean_ap, results, dataset=None, scale_ranges=None, logger=None ): @@ -59,6 +62,7 @@ def print_map_summary( # pylint: disable=too-many-locals,too-many-branches if scale_ranges is not None: assert len(scale_ranges) == num_scales + segmentation = "miou" in results num_classes = len(results) recalls = np.zeros((num_scales, num_classes), dtype=np.float32) aps = np.zeros((num_scales, num_classes), dtype=np.float32) @@ -68,7 +72,8 @@ def print_map_summary( # pylint: disable=too-many-locals,too-many-branches if cls_result["recall"].size > 0: recalls[:, i] = np.array(cls_result["recall"], ndmin=2)[:, -1] aps[:, i] = cls_result["ap"] - mious[:, i] = cls_result["miou"] + if segmentation: + mious[:, i] = cls_result["miou"] num_gts[:, i] = cls_result["num_gts"] if dataset is None: @@ -81,7 +86,9 @@ def print_map_summary( # pylint: disable=too-many-locals,too-many-branches if not isinstance(mean_ap, list): mean_ap = [mean_ap] - header = ["class", "gts", "dets", "recall", "ap", "miou"] + header = ["class", "gts", "dets", "recall", "ap"] + if segmentation: + header.append("miou") for i in range(num_scales): if scale_ranges is not None: print_log(f"Scale range {scale_ranges[i]}", logger=logger) @@ -92,13 +99,16 @@ def print_map_summary( # pylint: disable=too-many-locals,too-many-branches num_gts[i, j], results[j]["num_dets"], f"{recalls[i, j]:.3f}", - f"{aps[i, j]:.3f}", - f"{mious[i, j]:.3f}", + f"{aps[i, j]:.3f}" ] + if segmentation: + row_data.append(f"{mious[i, j]:.3f}") table_data.append(row_data) - table_data.append(["mAP", "", "", "", f"{mean_ap[i]:.3f}", f"{np.mean(mious[i]):.3f}"]) + table_ = ["mAP", "", "", "", f"{mean_ap[i]:.3f}", f"{np.mean(mious[i]):.3f}"] if segmentation else ["mAP", "", "", "", f"{mean_ap[i]:.3f}"] + table_data.append(table_) table = AsciiTable(table_data) table.inner_footing_row_border = True + time.sleep(0.1) # prevent segmentation fault print_log("\n" + table.table, logger=logger) @@ -244,6 +254,7 @@ def __init__(self, annotation: List[Dict], domain: Domain, classes: List[str], n else: self.annotation = annotation self.nproc = nproc + mean_ap.print_map_summary = print_map_summary def get_gt_instance_masks(self, annotation: List[Dict]): """Format ground truth instance mask annotation. 
diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py index 8e04b37d16a..69fb668e2f3 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py @@ -104,6 +104,25 @@ def loss(self, cls_scores, bbox_preds, objectnesses, gt_bboxes, gt_labels, img_m return loss_dict + def forward_single(self, x, cls_convs, reg_convs, conv_cls, conv_reg, + conv_obj): + """Forward feature of a single scale level.""" + + cls_feat = cls_convs(x) + reg_feat = reg_convs(x) + + cls_score = conv_cls(cls_feat) + bbox_pred = conv_reg(reg_feat) + objectness = conv_obj(reg_feat) + + if cls_score.device.type == "hpu": + # put on cpu for further post-processing + cls_score = cls_score.cpu() + bbox_pred = bbox_pred.cpu() + objectness = objectness.cpu() + + return cls_score, bbox_pred, objectness + @HEADS.register_module() class CustomYOLOXHeadTrackingLossDynamics(TrackingLossDynamicsMixIn, CustomYOLOXHead): @@ -152,14 +171,6 @@ def loss(self, cls_scores, bbox_preds, objectnesses, gt_bboxes, gt_labels, img_m flatten_priors = torch.cat(mlvl_priors) flatten_bboxes = self._bbox_decode(flatten_priors, flatten_bbox_preds) - if "hpu" in flatten_cls_preds.device: - # put loss computastion on CPU -> faster, avoid errors - flatten_cls_preds = flatten_cls_preds.cpu() - flatten_bbox_preds = flatten_bbox_preds.cpu() - flatten_objectness = flatten_objectness.cpu() - flatten_priors = flatten_priors.cpu() - flatten_bboxes = flatten_bboxes.cpu() - # Init variables for loss dynamics tracking self.cur_batch_idx = 0 self.max_gt_bboxes_len = max([len(gt_bbox) for gt_bbox in gt_bboxes]) From 2455fe7d922a53dc59f78acbf2ae5947b3f0255b Mon Sep 17 00:00:00 2001 From: kprokofi Date: Tue, 7 Nov 2023 16:59:25 +0000 Subject: [PATCH 05/16] SSD, ATSS e2e training --- .../mmcv/hooks/recording_forward_hook.py | 2 ++ .../utils/_builder_build_data_parallel.py | 4 +-- .../detectors/custom_single_stage_detector.py | 11 ------ .../mmdet/models/heads/custom_atss_head.py | 35 +++++++++++++++++++ .../mmdet/models/heads/custom_ssd_head.py | 19 +++++----- .../detection/mobilenetv2_ssd/template.yaml | 2 +- .../recipes/stages/detection/incremental.py | 32 ++++++++--------- 7 files changed, 64 insertions(+), 41 deletions(-) diff --git a/src/otx/algorithms/common/adapters/mmcv/hooks/recording_forward_hook.py b/src/otx/algorithms/common/adapters/mmcv/hooks/recording_forward_hook.py index 062cc230367..d4df8bbbc22 100644 --- a/src/otx/algorithms/common/adapters/mmcv/hooks/recording_forward_hook.py +++ b/src/otx/algorithms/common/adapters/mmcv/hooks/recording_forward_hook.py @@ -74,6 +74,8 @@ def _recording_forward( ): # pylint: disable=unused-argument tensors = self.func(output) if isinstance(tensors, torch.Tensor): + if tensors.dtype == torch.bfloat16: + tensors = tensors.to(torch.float32) tensors_np = tensors.detach().cpu().numpy() elif isinstance(tensors, np.ndarray): tensors_np = tensors diff --git a/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py b/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py index 39c9bf5f7b3..df1a651bbc8 100644 --- a/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py +++ b/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py @@ -64,8 +64,8 @@ def build_data_parallel( model = model.xpu() model = 
XPUDataParallel(model, device_ids=config.gpu_ids) elif is_hpu_available() and config.get("gpu_ids", []): - model = model.hpu() - model = HPUDataParallel(model, device_ids=config.gpu_ids) + model = model.to("hpu") + model = HPUDataParallel(model, device_ids=config.gpu_ids, put_gt_on_device=False) elif torch.cuda.is_available() and config.get("gpu_ids", []): if distributed: model = model.cuda() diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py b/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py index 702da56c428..07bce4440ed 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py @@ -40,7 +40,6 @@ class CustomSingleStageDetector(SAMDetectorMixin, DetLossDynamicsTrackingMixin, def __init__(self, *args, task_adapt=None, **kwargs): super().__init__(*args, **kwargs) - # Hook for class-sensitive weight loading if task_adapt: self._register_load_state_dict_pre_hook( @@ -76,9 +75,7 @@ def forward_train(self, img, img_metas, gt_bboxes, gt_labels, gt_bboxes_ignore=N batch_input_shape = tuple(img[0].size()[-2:]) for img_meta in img_metas: img_meta["batch_input_shape"] = batch_input_shape - ttt = time.time() x = self.extract_feat(img) - print("extract_feat", time.time() - ttt) losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes, gt_labels, gt_bboxes_ignore, **kwargs) return losses @@ -100,14 +97,6 @@ def simple_test(self, img, img_metas, rescale=False): results_list = self.bbox_head.simple_test( feat, img_metas, rescale=rescale) - # bbox_results = [] - # for det_bboxes, det_labels in results_list: - # if det_bboxes.dtype == torch.bfloat16: - # det_bboxes = det_bboxes.to(torch.float32) - # det_labels = det_labels.to(torch.float32) - # bbox_results.append(bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes)) - # else: - # bbox_results.append(bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes)) bbox_results = [ bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes) for det_bboxes, det_labels in results_list diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_atss_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_atss_head.py index 477790e0d4d..3d708de4460 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_atss_head.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_atss_head.py @@ -49,6 +49,41 @@ def __init__(self, *args, bg_loss_weight=-1.0, use_qfl=False, qfl_cfg=None, **kw self.bg_loss_weight = bg_loss_weight self.use_qfl = use_qfl + def forward_single(self, x, scale): + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. + scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + + Returns: + tuple: + cls_score (Tensor): Cls scores for a single scale level + the channels number is num_anchors * num_classes. + bbox_pred (Tensor): Box energies / deltas for a single scale + level, the channels number is num_anchors * 4. + centerness (Tensor): Centerness for a single scale level, the + channel number is (N, num_anchors * 1, H, W). 
+ """ + cls_feat = x + reg_feat = x + for cls_conv in self.cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs: + reg_feat = reg_conv(reg_feat) + cls_score = self.atss_cls(cls_feat) + # we just follow atss, not apply exp in bbox_pred + bbox_pred = scale(self.atss_reg(reg_feat)).float() + centerness = self.atss_centerness(reg_feat) + if cls_score.device.type == "hpu": + # put further post-processing on cpu + cls_score = cls_score.cpu() + bbox_pred = bbox_pred.cpu() + centerness = centerness.cpu() + + return cls_score, bbox_pred, centerness + @force_fp32(apply_to=("cls_scores", "bbox_preds", "centernesses")) def loss(self, cls_scores, bbox_preds, centernesses, gt_bboxes, gt_labels, img_metas, gt_bboxes_ignore=None): """Compute losses of the head. diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_ssd_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_ssd_head.py index 42f37571457..1ab2e6a8bbe 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_ssd_head.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_ssd_head.py @@ -100,12 +100,16 @@ def forward(self, feats): """ cls_scores = [] bbox_preds = [] - start = time.time() for feat, reg_conv, cls_conv in zip(feats, self.reg_convs, self.cls_convs): - cls_scores.append(cls_conv(feat)) - bbox_preds.append(reg_conv(feat)) - print("bbox_head_forward: ", time.time() - start) + cls_out = cls_conv(feat) + reg_out = reg_conv(feat) + if cls_out.device.type == "hpu": + cls_scores.append(cls_out.cpu()) + bbox_preds.append(reg_out.cpu()) + else: + cls_scores.append(cls_out) + bbox_preds.append(reg_out) return cls_scores, bbox_preds def loss_single( @@ -145,7 +149,6 @@ def loss_single( """ # Re-weigting BG loss - start1 = time.time() label_weights = label_weights.reshape(-1) if self.bg_loss_weight >= 0.0: neg_indices = labels == self.num_classes @@ -153,7 +156,6 @@ def loss_single( label_weights[neg_indices] = self.bg_loss_weight loss_cls_all = self.loss_cls(cls_score, labels, label_weights) - print("loss_cls_all: ", time.time() - start1) if len(loss_cls_all.shape) > 1: loss_cls_all = loss_cls_all.sum(-1) # FG cat_id: [0, num_classes -1], BG cat_id: num_classes @@ -175,10 +177,7 @@ def loss_single( # TODO: We need to verify that this is working properly. 
# pylint: disable=redundant-keyword-arg - start = time.time() loss_bbox = self._get_loss_bbox(bbox_pred, bbox_targets, bbox_weights, num_total_samples) - print("loss_bbox: ", time.time() - start) - print("loss_single: ", time.time() - start1) return loss_cls[None], loss_bbox def _get_pos_inds(self, labels): @@ -204,9 +203,7 @@ def _get_loss_cls(self, num_total_samples, loss_cls_all, pos_inds, topk_loss_cls def loss(self, cls_scores, bbox_preds, gt_bboxes, gt_labels, img_metas, gt_bboxes_ignore=None): """Loss function.""" - start = time.time() losses = super().loss(cls_scores, bbox_preds, gt_bboxes, gt_labels, img_metas, gt_bboxes_ignore) - print("loss_ALL: ", time.time() - start) losses_cls = losses["loss_cls"] losses_bbox = losses["loss_bbox"] diff --git a/src/otx/algorithms/detection/configs/detection/mobilenetv2_ssd/template.yaml b/src/otx/algorithms/detection/configs/detection/mobilenetv2_ssd/template.yaml index 3e8768caad4..7b517542b35 100644 --- a/src/otx/algorithms/detection/configs/detection/mobilenetv2_ssd/template.yaml +++ b/src/otx/algorithms/detection/configs/detection/mobilenetv2_ssd/template.yaml @@ -36,7 +36,7 @@ hyper_parameters: learning_rate_warmup_iters: default_value: 3 num_iters: - default_value: 50 + default_value: 200 nncf_optimization: enable_quantization: default_value: true diff --git a/src/otx/recipes/stages/detection/incremental.py b/src/otx/recipes/stages/detection/incremental.py index a78cea85e93..692a7dbee7c 100644 --- a/src/otx/recipes/stages/detection/incremental.py +++ b/src/otx/recipes/stages/detection/incremental.py @@ -21,24 +21,24 @@ interval=1, priority=75, ), - # dict( - # type="EMAHook", - # priority="ABOVE_NORMAL", - # momentum=0.1, - # ), + dict( + type="EMAHook", + priority="ABOVE_NORMAL", + momentum=0.1, + ), ] -# lr_config = dict( -# policy="ReduceLROnPlateau", -# metric="mAP", -# patience=5, -# iteration_patience=0, -# interval=1, -# min_lr=1e-06, -# warmup="linear", -# warmup_iters=200, -# warmup_ratio=0.3333333333333333, -# ) +lr_config = dict( + policy="ReduceLROnPlateau", + metric="mAP", + patience=5, + iteration_patience=0, + interval=1, + min_lr=1e-06, + warmup="linear", + warmup_iters=200, + warmup_ratio=0.3333333333333333, +) ignore = True adaptive_validation_interval = dict( From 322aaf265c4b13562033e321a7e3484d29651e6e Mon Sep 17 00:00:00 2001 From: kprokofi Date: Wed, 8 Nov 2023 12:31:13 +0000 Subject: [PATCH 06/16] stabilize mask rcnn a bit --- .../adapters/mmdet/models/heads/__init__.py | 2 + .../mmdet/models/heads/custom_roi_head.py | 79 +++++++++++++++++++ .../mmdet/models/heads/custom_rpn_head.py | 26 ++++++ .../resnet50_maskrcnn/model.py | 2 +- .../resnet50_maskrcnn/template.yaml | 4 +- 5 files changed, 110 insertions(+), 3 deletions(-) create mode 100644 src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py index 28da39d0a1b..a0a410f3035 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py @@ -14,6 +14,7 @@ from .custom_vfnet_head import CustomVFNetHead from .custom_yolox_head import CustomYOLOXHead from .detr_head import DETRHeadExtension +from .custom_rpn_head import CustomRPNHead __all__ = [ "CrossDatasetDetectorHead", @@ -27,6 +28,7 @@ "CustomVFNetHead", "CustomYOLOXHead", "DETRHeadExtension", + "CustomRPNHead", # Loss dynamics tracking 
"CustomATSSHeadTrackingLossDynamics", ] diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py index 05902fc9e70..fbd8619c0e8 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py @@ -33,6 +33,39 @@ def init_bbox_head(self, bbox_roi_extractor, bbox_head): bbox_head.type = "CustomConvFCBBoxHead" self.bbox_head = build_head(bbox_head) + + def _bbox_forward(self, x, rois): + """Box head forward function used in both training and testing.""" + # TODO: a more flexible way to decide which feature maps to use + bbox_feats = self.bbox_roi_extractor( + x[:self.bbox_roi_extractor.num_inputs], rois) + if self.with_shared_head: + bbox_feats = self.shared_head(bbox_feats) + cls_score, bbox_pred = self.bbox_head(bbox_feats) + + bbox_results = dict( + cls_score=cls_score, bbox_pred=bbox_pred, bbox_feats=bbox_feats) + return bbox_results + + def _mask_forward(self, x, rois=None, pos_inds=None, bbox_feats=None): + """Mask head forward function used in both training and testing.""" + assert ((rois is not None) ^ + (pos_inds is not None and bbox_feats is not None)) + if rois is not None: + mask_feats = self.mask_roi_extractor( + x[:self.mask_roi_extractor.num_inputs], rois) + if self.with_shared_head: + mask_feats = self.shared_head(mask_feats) + else: + assert bbox_feats is not None + mask_feats = bbox_feats[pos_inds] + + mask_pred = self.mask_head(mask_feats) + if mask_pred.device.type == "hpu": + mask_pred = mask_pred.cpu() + mask_results = dict(mask_pred=mask_pred, mask_feats=mask_feats) + return mask_results + def _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels, img_metas): """Run forward function and calculate loss for box head in training.""" rois = bbox2roi([res.bboxes for res in sampling_results]) @@ -125,6 +158,52 @@ def get_targets(self, sampling_results, gt_bboxes, gt_labels, img_metas, rcnn_tr valid_label_mask = torch.cat(valid_label_mask, 0) return labels, label_weights, bbox_targets, bbox_weights, valid_label_mask + + def forward(self, x): + # shared part + if self.num_shared_convs > 0: + for conv in self.shared_convs: + x = conv(x) + + if self.num_shared_fcs > 0: + if self.with_avg_pool: + x = self.avg_pool(x) + + x = x.flatten(1) + + for fc in self.shared_fcs: + x = self.relu(fc(x)) + # separate branches + x_cls = x + x_reg = x + + for conv in self.cls_convs: + x_cls = conv(x_cls) + if x_cls.dim() > 2: + if self.with_avg_pool: + x_cls = self.avg_pool(x_cls) + x_cls = x_cls.flatten(1) + for fc in self.cls_fcs: + x_cls = self.relu(fc(x_cls)) + + for conv in self.reg_convs: + x_reg = conv(x_reg) + if x_reg.dim() > 2: + if self.with_avg_pool: + x_reg = self.avg_pool(x_reg) + x_reg = x_reg.flatten(1) + for fc in self.reg_fcs: + x_reg = self.relu(fc(x_reg)) + + cls_score = self.fc_cls(x_cls) if self.with_cls else None + bbox_pred = self.fc_reg(x_reg) if self.with_reg else None + if cls_score.device.type == 'hpu': + cls_score = cls_score.cpu() + bbox_pred = bbox_pred.cpu() + + return cls_score, bbox_pred + + @force_fp32(apply_to=("cls_score", "bbox_pred")) def loss( self, diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py new file mode 100644 index 00000000000..84d23757cca --- /dev/null +++ 
b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py @@ -0,0 +1,26 @@ +import torch.nn.functional as F +from mmdet.models.dense_heads import RPNHead +from mmdet.models.builder import HEADS + + +@HEADS.register_module() +class CustomRPNHead(RPNHead): + """RPN head. + + Args: + in_channels (int): Number of channels in the input feature map. + init_cfg (dict or list[dict], optional): Initialization config dict. + num_convs (int): Number of convolution layers in the head. Default 1. + """ + + def forward_single(self, x): + """Forward feature map of a single scale level.""" + x = self.rpn_conv(x) + x = F.relu(x, inplace=False) + rpn_cls_score = self.rpn_cls(x) + rpn_bbox_pred = self.rpn_reg(x) + if rpn_cls_score.device.type == "hpu": + rpn_cls_score = rpn_cls_score.cpu() + rpn_bbox_pred = rpn_bbox_pred.cpu() + + return rpn_cls_score, rpn_bbox_pred diff --git a/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/model.py b/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/model.py index 6832028e425..d8918edc33f 100644 --- a/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/model.py +++ b/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/model.py @@ -33,7 +33,7 @@ num_outs=5, ), rpn_head=dict( - type="RPNHead", + type="CustomRPNHead", in_channels=256, feat_channels=256, anchor_generator=dict( diff --git a/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/template.yaml b/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/template.yaml index 17a74b1c25e..a77d5a22c47 100644 --- a/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/template.yaml +++ b/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/template.yaml @@ -26,10 +26,10 @@ hyper_parameters: parameter_overrides: learning_parameters: batch_size: - default_value: 4 + default_value: 16 auto_hpo_state: POSSIBLE inference_batch_size: - default_value: 1 + default_value: 16 learning_rate: default_value: 0.007 auto_hpo_state: POSSIBLE From 16b8d67351d5b997a99d7387818cac5d8298b2aa Mon Sep 17 00:00:00 2001 From: kprokofi Date: Wed, 8 Nov 2023 12:42:46 +0000 Subject: [PATCH 07/16] don't put gt on hpu for OD --- .../adapters/mmcv/utils/_builder_build_data_parallel.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py b/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py index df1a651bbc8..ae865bd10b6 100644 --- a/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py +++ b/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py @@ -140,9 +140,10 @@ def val_step(self, *inputs, **kwargs): class HPUDataParallel(MMDataParallel): - def __init__(self, *args, enable_autocast: bool = False, **kwargs): + def __init__(self, *args, enable_autocast: bool = False, put_gt_on_device=True, **kwargs): super().__init__(*args, **kwargs) self.enable_autocast = enable_autocast + self.put_gt_on_device = put_gt_on_device self.src_device_obj = torch.device("hpu", self.device_ids[0]) def scatter(self, inputs, kwargs, device_ids): @@ -153,6 +154,8 @@ def scatter(self, inputs, kwargs, device_ids): for val in x: if isinstance(val, dict): for k in val: + if not self.put_gt_on_device and k.startswith("gt_"): + continue if isinstance(val[k], torch.Tensor): val[k] = 
val[k].to(self.src_device_obj) elif isinstance(val[k], list): From 508e0afbfe4a07951253c7bf44eee28d90b4f0c6 Mon Sep 17 00:00:00 2001 From: kprokofi Date: Wed, 8 Nov 2023 12:45:25 +0000 Subject: [PATCH 08/16] minor fix --- .../algorithms/common/adapters/mmcv/configurer.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/otx/algorithms/common/adapters/mmcv/configurer.py b/src/otx/algorithms/common/adapters/mmcv/configurer.py index 3d0e936aef2..f2c6aaec6ee 100644 --- a/src/otx/algorithms/common/adapters/mmcv/configurer.py +++ b/src/otx/algorithms/common/adapters/mmcv/configurer.py @@ -176,11 +176,7 @@ def configure_device(self, cfg): elif "gpu_ids" not in cfg: cfg.gpu_ids = range(1) -<<<<<<< HEAD - # consider "cuda", "hpu" and "cpu" device only -======= # consider "cuda", "xpu", "hpu" and "cpu" device only ->>>>>>> added support for OD on habana if is_hpu_available(): cfg.device = "hpu" elif torch.cuda.is_available(): @@ -188,10 +184,6 @@ def configure_device(self, cfg): elif is_xpu_available(): try: import intel_extension_for_pytorch as ipex # noqa: F401 -<<<<<<< HEAD - -======= ->>>>>>> added support for OD on habana cfg.device = "xpu" except ModuleNotFoundError: cfg.device = "cpu" @@ -270,7 +262,6 @@ def configure_fp16(cfg: Config): distributed = getattr(cfg, "distributed", False) opts: Dict[str, Any] = {} if fp16_config is not None: -<<<<<<< HEAD if is_hpu_available(): if optim_type == "SAMOptimizerHook": # TODO (sungchul): consider SAM optimizer @@ -278,9 +269,6 @@ def configure_fp16(cfg: Config): opts["type"] = "HPUOptimizerHook" cfg.optimizer_config.update(opts) elif torch.cuda.is_available() or is_xpu_available(): -======= - if torch.cuda.is_available() or is_xpu_available(): ->>>>>>> added support for OD on habana opts.update({"distributed": distributed, **fp16_config}) if optim_type == "SAMOptimizerHook": opts["type"] = "Fp16SAMOptimizerHook" From 7f2c17831decec355e90fdaa956f62727d18cf29 Mon Sep 17 00:00:00 2001 From: kprokofi Date: Wed, 8 Nov 2023 15:10:45 +0000 Subject: [PATCH 09/16] Enable e2e training for Instance Segmentation. 
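
Most changes in this patch follow one pattern: keep the backbone and head compute on the HPU, but hand the small prediction tensors back to the CPU before mmdet's target assignment, NMS and loss code runs, so that post-processing stays on the host (the hunks below mark this with "put further post-processing on cpu"). The sketch that follows is an illustration only, not code from this series; the module and class names are made up and it runs on CPU as written.

    import torch
    import torch.nn as nn


    class HeadWithCpuOutputs(nn.Module):
        # Toy stand-in for CustomRPNHead / CustomConvFCBBoxHead: compute the
        # predictions on whatever device the features live on, then move the
        # small output tensors to the CPU when that device is an HPU so the
        # downstream target assignment / NMS keeps running on the host.
        def __init__(self, in_channels=8, num_classes=3):
            super().__init__()
            self.cls = nn.Conv2d(in_channels, num_classes, kernel_size=1)
            self.reg = nn.Conv2d(in_channels, 4, kernel_size=1)

        def forward(self, x):
            cls_score = self.cls(x)
            bbox_pred = self.reg(x)
            if cls_score.device.type == "hpu":
                cls_score = cls_score.cpu()
                bbox_pred = bbox_pred.cpu()
            return cls_score, bbox_pred


    if __name__ == "__main__":
        head = HeadWithCpuOutputs()
        feats = torch.randn(2, 8, 16, 16)  # on CPU here; on HPU during training
        scores, deltas = head(feats)
        print(scores.shape, deltas.shape)
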
--- .../common/adapters/mmcv/hooks/__init__.py | 5 +-- .../adapters/mmcv/hooks/hpu_optimizer_hook.py | 1 + .../detection/adapters/mmdet/apis/train.py | 24 ++++++++++--- .../mmdet/models/heads/custom_roi_head.py | 36 +++++++++++++++++++ .../instance_segmentation/configuration.yaml | 4 +-- .../efficientnetb2b_maskrcnn/model.py | 2 +- .../maskrcnn_swin_t/model.py | 3 +- .../resnet50_maskrcnn/model.py | 2 +- .../instance-segmentation/incremental.py | 6 ++-- .../stages/instance-segmentation/train.py | 7 ++-- 10 files changed, 69 insertions(+), 21 deletions(-) diff --git a/src/otx/algorithms/common/adapters/mmcv/hooks/__init__.py b/src/otx/algorithms/common/adapters/mmcv/hooks/__init__.py index a6dd4ea965c..75113aefa33 100644 --- a/src/otx/algorithms/common/adapters/mmcv/hooks/__init__.py +++ b/src/otx/algorithms/common/adapters/mmcv/hooks/__init__.py @@ -52,7 +52,6 @@ from .semisl_cls_hook import SemiSLClsHook from .task_adapt_hook import TaskAdaptHook from .two_crop_transform_hook import TwoCropTransformHook -from .hpu_optimizer_hook import HPUOptimizerHook, HPUDistOptimizerHook __all__ = [ "AdaptiveRepeatDataHook", @@ -90,9 +89,7 @@ "TwoCropTransformHook", "MeanTeacherHook", "MemCacheHook", - "LossDynamicsTrackingHook", - "HPUOptimizerHook", - "HPUDistOptimizerHook", + "LossDynamicsTrackingHook" ] try: diff --git a/src/otx/algorithms/common/adapters/mmcv/hooks/hpu_optimizer_hook.py b/src/otx/algorithms/common/adapters/mmcv/hooks/hpu_optimizer_hook.py index f5e26c49083..292cbe8aa18 100644 --- a/src/otx/algorithms/common/adapters/mmcv/hooks/hpu_optimizer_hook.py +++ b/src/otx/algorithms/common/adapters/mmcv/hooks/hpu_optimizer_hook.py @@ -17,6 +17,7 @@ def after_train_iter(self, runner): runner.optimizer.zero_grad() if self.detect_anomalous_params: self.detect_anomalous_parameters(runner.outputs["loss"], runner) + runner.outputs["loss"].backward() htcore.mark_step() diff --git a/src/otx/algorithms/detection/adapters/mmdet/apis/train.py b/src/otx/algorithms/detection/adapters/mmdet/apis/train.py index 819e51f8f6b..802ed4e467e 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/apis/train.py +++ b/src/otx/algorithms/detection/adapters/mmdet/apis/train.py @@ -26,6 +26,8 @@ from habana_frameworks.torch.utils.library_loader import load_habana_module from otx.algorithms.common.adapters.mmcv.utils import XPUDataParallel, HPUDataParallel +from otx.algorithms.common.adapters.mmcv.utils.hpu_optimizers import HABANA_OPTIMIZERS + ext_module = ext_loader.load_ext("_ext", ["nms", "softnms", "nms_match", "nms_rotated", "nms_quadri"]) dp_factory["xpu"] = XPUDataParallel @@ -127,7 +129,7 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times os.environ["PT_HPU_LAZY_MODE"] = "1" assert len(cfg.gpu_ids) == 1 model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids, dim=0, - is_autocast=bool(fp16_cfg), put_gt_on_device=False) + enable_autocast=bool(fp16_cfg), put_gt_on_device=False) model.to(f"hpu:{cfg.gpu_ids[0]}", non_blocking=True) htcore.mark_step() model.zero_grad() @@ -136,6 +138,8 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times # build optimizer auto_scale_lr(cfg, distributed, logger) + if cfg.device == "hpu": + cfg.optimizer = patch_optimizer(cfg.optimizer) optimizer = build_optimizer(model, cfg.optimizer) if cfg.device == "xpu": @@ -152,10 +156,10 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times if cfg.device == "hpu": NMSop.forward = monkey_patched_xpu_nms RoIAlign.forward = 
monkey_patched_xpu_roi_align - from otx.algorithms.common.adapters.mmcv.optimizer.hpu_optimizer import register_habana_optimizers - habana_optimizers = register_habana_optimizers() - if (new_type := "Fused" + cfg.optimizer.get("type", "SGD")) in habana_optimizers: - cfg.optimizer["type"] = new_type + # build runner + if cfg.device == "hpu": + if (new_type := "Fused" + cfg.optimizer.get("type", "SGD")) in HABANA_OPTIMIZERS: + cfg.optimizer["type"] = new_type runner = build_runner( cfg.runner, default_args=dict(model=model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) @@ -217,6 +221,16 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow) +def patch_optimizer(cfg_optim): + "Patch optimizer for OD and IS" + if cfg_optim["type"] == "SGD": + return cfg_optim + + # Only SGD for OD and IS supported by now on HPU + cfg_optim["type"] = "SGD" + if "betas" in cfg_optim: + del cfg_optim["betas"] + return cfg_optim def monkey_patched_xpu_nms(ctx, bboxes, scores, iou_threshold, offset, score_threshold, max_num): """Runs MMCVs NMS with torchvision.nms, or forces NMS from MMCV to run on CPU.""" diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py index fbd8619c0e8..89652c80f5a 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py @@ -63,6 +63,8 @@ def _mask_forward(self, x, rois=None, pos_inds=None, bbox_feats=None): mask_pred = self.mask_head(mask_feats) if mask_pred.device.type == "hpu": mask_pred = mask_pred.cpu() + mask_feats = mask_feats.cpu() + mask_results = dict(mask_pred=mask_pred, mask_feats=mask_feats) return mask_results @@ -87,6 +89,40 @@ def _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels, img_met bbox_results.update(loss_bbox=loss_bbox) return bbox_results + def _mask_forward_train(self, x, sampling_results, bbox_feats, gt_masks, + img_metas): + """Run forward function and calculate loss for mask head in + training.""" + if not self.share_roi_extractor: + pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results]) + mask_results = self._mask_forward(x, pos_rois) + else: + pos_inds = [] + device = bbox_feats.device + for res in sampling_results: + pos_inds.append( + torch.ones( + res.pos_bboxes.shape[0], + device=device, + dtype=torch.uint8)) + pos_inds.append( + torch.zeros( + res.neg_bboxes.shape[0], + device=device, + dtype=torch.uint8)) + pos_inds = torch.cat(pos_inds) + + mask_results = self._mask_forward( + x, pos_inds=pos_inds, bbox_feats=bbox_feats) + + mask_targets = self.mask_head.get_targets(sampling_results, gt_masks, + self.train_cfg) + pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results]) + loss_mask = self.mask_head.loss(mask_results['mask_pred'], + mask_targets, pos_labels) + + mask_results.update(loss_mask=loss_mask, mask_targets=mask_targets) + return mask_results @HEADS.register_module() class CustomConvFCBBoxHead(Shared2FCBBoxHead, CrossDatasetDetectorHead): diff --git a/src/otx/algorithms/detection/configs/instance_segmentation/configuration.yaml b/src/otx/algorithms/detection/configs/instance_segmentation/configuration.yaml index f0672ae5ff8..c49981e2c66 100644 --- a/src/otx/algorithms/detection/configs/instance_segmentation/configuration.yaml +++ 
b/src/otx/algorithms/detection/configs/instance_segmentation/configuration.yaml @@ -194,7 +194,7 @@ learning_parameters: warning: This is applied exclusively when early stopping is enabled. use_adaptive_interval: affects_outcome_of: TRAINING - default_value: true + default_value: false description: Depending on the size of iteration per epoch, adaptively update the validation interval and related values. editable: true header: Use adaptive validation interval @@ -208,7 +208,7 @@ learning_parameters: warning: This will automatically control the patience and interval when early stopping is enabled. auto_adapt_batch_size: affects_outcome_of: TRAINING - default_value: Safe + default_value: None description: Safe => Prevent GPU out of memory. Full => Find a batch size using most of GPU memory. editable: true enum_name: BatchSizeAdaptType diff --git a/src/otx/algorithms/detection/configs/instance_segmentation/efficientnetb2b_maskrcnn/model.py b/src/otx/algorithms/detection/configs/instance_segmentation/efficientnetb2b_maskrcnn/model.py index 72ca9481ef3..03cb21733dc 100644 --- a/src/otx/algorithms/detection/configs/instance_segmentation/efficientnetb2b_maskrcnn/model.py +++ b/src/otx/algorithms/detection/configs/instance_segmentation/efficientnetb2b_maskrcnn/model.py @@ -28,7 +28,7 @@ type="CustomMaskRCNN", # Use CustomMaskRCNN for Incremental Learning neck=dict(type="FPN", in_channels=[24, 48, 120, 352], out_channels=80, num_outs=5), rpn_head=dict( - type="RPNHead", + type="CustomRPNHead", in_channels=80, feat_channels=80, anchor_generator=dict(type="AnchorGenerator", scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), diff --git a/src/otx/algorithms/detection/configs/instance_segmentation/maskrcnn_swin_t/model.py b/src/otx/algorithms/detection/configs/instance_segmentation/maskrcnn_swin_t/model.py index 66f7522bdee..d6f95c1d366 100644 --- a/src/otx/algorithms/detection/configs/instance_segmentation/maskrcnn_swin_t/model.py +++ b/src/otx/algorithms/detection/configs/instance_segmentation/maskrcnn_swin_t/model.py @@ -38,7 +38,7 @@ ), neck=dict(type="FPN", in_channels=[96, 192, 384, 768], out_channels=256, num_outs=5), rpn_head=dict( - type="RPNHead", + type="CustomRPNHead", in_channels=256, feat_channels=256, anchor_generator=dict(type="AnchorGenerator", scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), @@ -134,6 +134,7 @@ ) evaluation = dict(interval=1, metric="mAP", save_best="mAP", iou_thr=[0.5]) + optimizer = dict( _delete_=True, type="AdamW", diff --git a/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/model.py b/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/model.py index d8918edc33f..d0e51ef26c9 100644 --- a/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/model.py +++ b/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/model.py @@ -159,5 +159,5 @@ v2.0/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco/\ mask_rcnn_r50_fpn_mstrain-poly_3x_coco_20210524_201154-21b550bb.pth" -evaluation = dict(interval=1, metric="mAP", save_best="mAP", iou_thr=[0.5]) +evaluation = dict(interval=100, metric="mAP", save_best="mAP", iou_thr=[0.5]) ignore = True diff --git a/src/otx/recipes/stages/instance-segmentation/incremental.py b/src/otx/recipes/stages/instance-segmentation/incremental.py index 93cda5428e7..d6c17f4767f 100644 --- a/src/otx/recipes/stages/instance-segmentation/incremental.py +++ b/src/otx/recipes/stages/instance-segmentation/incremental.py @@ -3,7 
+3,7 @@ task = "instance-segmentation" evaluation = dict( - interval=1, metric="mAP", save_best="mAP", iou_thr=[0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + interval=100, metric="mAP", save_best="mAP", iou_thr=[0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] ) task_adapt = dict( @@ -19,6 +19,6 @@ ignore = True adaptive_validation_interval = dict( max_interval=5, - enable_adaptive_interval_hook=True, - enable_eval_before_run=True, + enable_adaptive_interval_hook=False, + enable_eval_before_run=False, ) diff --git a/src/otx/recipes/stages/instance-segmentation/train.py b/src/otx/recipes/stages/instance-segmentation/train.py index 0ac963fa94d..a9bd72da3e0 100644 --- a/src/otx/recipes/stages/instance-segmentation/train.py +++ b/src/otx/recipes/stages/instance-segmentation/train.py @@ -1,14 +1,14 @@ _base_ = [ "../_base_/default.py", "../_base_/logs/tensorboard_logger.py", - "../_base_/optimizers/sgd.py", + "../_base_/optimizers/adam.py", "../_base_/runners/epoch_runner_cancel.py", "../_base_/schedules/plateau.py", ] optimizer = dict( + type="SGD", lr=0.001, - momentum=0.9, weight_decay=0.0001, ) @@ -26,7 +26,6 @@ evaluation = dict(interval=1, metric="mAP", save_best="mAP") early_stop_metric = "mAP" - custom_hooks = [ dict( type="LazyEarlyStoppingHook", @@ -40,7 +39,7 @@ dict( type="AdaptiveTrainSchedulingHook", enable_adaptive_interval_hook=False, - enable_eval_before_run=True, + enable_eval_before_run=False, ), dict(type="LoggerReplaceHook"), dict( From 9400ed52d80b134166ffe9b523da1f972367f5e5 Mon Sep 17 00:00:00 2001 From: kprokofi Date: Wed, 8 Nov 2023 15:41:01 +0000 Subject: [PATCH 10/16] clean the code stage 1 --- .../common/adapters/mmcv/configurer.py | 6 - .../common/adapters/mmcv/hooks/__init__.py | 2 +- .../adapters/mmcv/hooks/hpu_optimizer_hook.py | 1 - .../utils/_builder_build_data_parallel.py | 2 +- src/otx/algorithms/common/utils/__init__.py | 1 - src/otx/algorithms/common/utils/utils.py | 1 - .../detection/adapters/mmdet/apis/train.py | 7 +- .../adapters/mmdet/evaluation/evaluator.py | 5 +- .../detectors/custom_single_stage_detector.py | 29 +---- .../mmdet/models/heads/custom_atss_head.py | 11 +- .../mmdet/models/heads/custom_roi_head.py | 109 +----------------- .../efficientnetb2b_maskrcnn/template.yaml | 2 +- .../instance-segmentation/incremental.py | 2 +- .../stages/instance-segmentation/train.py | 2 +- 14 files changed, 15 insertions(+), 165 deletions(-) diff --git a/src/otx/algorithms/common/adapters/mmcv/configurer.py b/src/otx/algorithms/common/adapters/mmcv/configurer.py index f2c6aaec6ee..bfdf3af4b71 100644 --- a/src/otx/algorithms/common/adapters/mmcv/configurer.py +++ b/src/otx/algorithms/common/adapters/mmcv/configurer.py @@ -280,12 +280,6 @@ def configure_fp16(cfg: Config): cfg.fp16 = fp16_config opts = dict() cfg.optimizer_config.update(opts) - elif is_hpu_available(): - if optim_type == "SAMOptimizerHook": - # TODO (sungchul): consider SAM optimizer - logger.warning("SAMOptimizerHook is not supported on HPU. 
Changed to OptimizerHook.") - opts["type"] = "HPUOptimizerHook" - cfg.optimizer_config.update(opts) else: logger.info("Revert FP16 to FP32 on CPU device") diff --git a/src/otx/algorithms/common/adapters/mmcv/hooks/__init__.py b/src/otx/algorithms/common/adapters/mmcv/hooks/__init__.py index 75113aefa33..a7c41d80fee 100644 --- a/src/otx/algorithms/common/adapters/mmcv/hooks/__init__.py +++ b/src/otx/algorithms/common/adapters/mmcv/hooks/__init__.py @@ -89,7 +89,7 @@ "TwoCropTransformHook", "MeanTeacherHook", "MemCacheHook", - "LossDynamicsTrackingHook" + "LossDynamicsTrackingHook", ] try: diff --git a/src/otx/algorithms/common/adapters/mmcv/hooks/hpu_optimizer_hook.py b/src/otx/algorithms/common/adapters/mmcv/hooks/hpu_optimizer_hook.py index 292cbe8aa18..f5e26c49083 100644 --- a/src/otx/algorithms/common/adapters/mmcv/hooks/hpu_optimizer_hook.py +++ b/src/otx/algorithms/common/adapters/mmcv/hooks/hpu_optimizer_hook.py @@ -17,7 +17,6 @@ def after_train_iter(self, runner): runner.optimizer.zero_grad() if self.detect_anomalous_params: self.detect_anomalous_parameters(runner.outputs["loss"], runner) - runner.outputs["loss"].backward() htcore.mark_step() diff --git a/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py b/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py index ae865bd10b6..d20cd540ed1 100644 --- a/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py +++ b/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py @@ -65,7 +65,7 @@ def build_data_parallel( model = XPUDataParallel(model, device_ids=config.gpu_ids) elif is_hpu_available() and config.get("gpu_ids", []): model = model.to("hpu") - model = HPUDataParallel(model, device_ids=config.gpu_ids, put_gt_on_device=False) + model = HPUDataParallel(model, device_ids=config.gpu_ids) elif torch.cuda.is_available() and config.get("gpu_ids", []): if distributed: model = model.cuda() diff --git a/src/otx/algorithms/common/utils/__init__.py b/src/otx/algorithms/common/utils/__init__.py index 23bb01ed20f..6395bd6e60d 100644 --- a/src/otx/algorithms/common/utils/__init__.py +++ b/src/otx/algorithms/common/utils/__init__.py @@ -30,7 +30,6 @@ get_task_class, is_hpu_available, is_xpu_available, - is_hpu_available, load_template, read_py_config, set_random_seed, diff --git a/src/otx/algorithms/common/utils/utils.py b/src/otx/algorithms/common/utils/utils.py index c9b651244a2..92e1b2f0853 100644 --- a/src/otx/algorithms/common/utils/utils.py +++ b/src/otx/algorithms/common/utils/utils.py @@ -17,7 +17,6 @@ import torch import yaml from addict import Dict as adict -import habana_frameworks.torch as htorch HPU_AVAILABLE = None try: diff --git a/src/otx/algorithms/detection/adapters/mmdet/apis/train.py b/src/otx/algorithms/detection/adapters/mmdet/apis/train.py index 802ed4e467e..ef13e8d84b3 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/apis/train.py +++ b/src/otx/algorithms/detection/adapters/mmdet/apis/train.py @@ -24,7 +24,6 @@ from torchvision.ops import nms as tv_nms from torchvision.ops import roi_align as tv_roi_align -from habana_frameworks.torch.utils.library_loader import load_habana_module from otx.algorithms.common.adapters.mmcv.utils import XPUDataParallel, HPUDataParallel from otx.algorithms.common.adapters.mmcv.utils.hpu_optimizers import HABANA_OPTIMIZERS @@ -32,7 +31,6 @@ ext_module = ext_loader.load_ext("_ext", ["nms", "softnms", "nms_match", "nms_rotated", "nms_quadri"]) dp_factory["xpu"] = XPUDataParallel 
dp_factory["hpu"] = HPUDataParallel -load_habana_module() def auto_scale_lr(cfg, distributed, logger): @@ -125,12 +123,14 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids, enable_autocast=bool(fp16_cfg)) model.to(f"xpu:{cfg.gpu_ids[0]}") elif cfg.device == "hpu": + from habana_frameworks.torch.utils.library_loader import load_habana_module import habana_frameworks.torch.core as htcore + load_habana_module() os.environ["PT_HPU_LAZY_MODE"] = "1" assert len(cfg.gpu_ids) == 1 model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids, dim=0, enable_autocast=bool(fp16_cfg), put_gt_on_device=False) - model.to(f"hpu:{cfg.gpu_ids[0]}", non_blocking=True) + model.to(model.src_device_obj) htcore.mark_step() model.zero_grad() else: @@ -254,6 +254,7 @@ def monkey_patched_xpu_nms(ctx, bboxes, scores, iou_threshold, offset, score_thr inds = ext_module.nms(bboxes, scores, iou_threshold=float(iou_threshold), offset=offset) bboxes = bboxes.to(device) scores = scores.to(device) + if max_num > 0: inds = inds[:max_num] if is_filtering_by_score: diff --git a/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py b/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py index 553b9bc7543..715c260d371 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py +++ b/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py @@ -33,7 +33,6 @@ from otx.api.utils.time_utils import timeit - def print_map_summary( # pylint: disable=too-many-locals,too-many-branches mean_ap, results, dataset=None, scale_ranges=None, logger=None ): @@ -64,6 +63,7 @@ def print_map_summary( # pylint: disable=too-many-locals,too-many-branches segmentation = "miou" in results num_classes = len(results) + recalls = np.zeros((num_scales, num_classes), dtype=np.float32) aps = np.zeros((num_scales, num_classes), dtype=np.float32) num_gts = np.zeros((num_scales, num_classes), dtype=int) @@ -386,7 +386,7 @@ def evaluate(self, results, logger, iou_thr, scale_ranges): metric: mAP and mIoU metric """ if self.domain == Domain.DETECTION: - output = eval_map( + return eval_map( results, self.annotation, scale_ranges=scale_ranges, @@ -394,5 +394,4 @@ def evaluate(self, results, logger, iou_thr, scale_ranges): dataset=self.classes, logger=logger, ) - return output return self.evaluate_mask(results, logger, iou_thr) diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py b/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py index 07bce4440ed..e5587270545 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py @@ -6,11 +6,8 @@ import functools import torch -import time from mmdet.models.builder import DETECTORS from mmdet.models.detectors.single_stage import SingleStageDetector -from mmdet.core import bbox2result - from otx.algorithms.common.adapters.mmcv.hooks.recording_forward_hook import ( FeatureVectorHook, ) @@ -36,7 +33,7 @@ class CustomSingleStageDetector(SAMDetectorMixin, DetLossDynamicsTrackingMixin, L2SPDetectorMixin, SingleStageDetector): """SAM optimizer & L2SP regularizer enabled custom SSD.""" - # TRACKING_LOSS_TYPE = (TrackingLossType.cls, TrackingLossType.bbox) + TRACKING_LOSS_TYPE = (TrackingLossType.cls, TrackingLossType.bbox) def __init__(self, *args, 
task_adapt=None, **kwargs): super().__init__(*args, **kwargs) @@ -79,30 +76,6 @@ def forward_train(self, img, img_metas, gt_bboxes, gt_labels, gt_bboxes_ignore=N losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes, gt_labels, gt_bboxes_ignore, **kwargs) return losses - def simple_test(self, img, img_metas, rescale=False): - """Test function without test-time augmentation. - - Args: - img (torch.Tensor): Images with shape (N, C, H, W). - img_metas (list[dict]): List of image information. - rescale (bool, optional): Whether to rescale the results. - Defaults to False. - - Returns: - list[list[np.ndarray]]: BBox results of each image and classes. - The outer list corresponds to each image. The inner list - corresponds to each class. - """ - feat = self.extract_feat(img) - results_list = self.bbox_head.simple_test( - feat, img_metas, rescale=rescale) - - bbox_results = [ - bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes) - for det_bboxes, det_labels in results_list - ] - return bbox_results - @staticmethod def load_state_dict_pre_hook(model, model_classes, chkpt_classes, chkpt_dict, prefix, *args, **kwargs): """Modify input state_dict according to class name matching before weight loading.""" diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_atss_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_atss_head.py index 3d708de4460..41b7fd3aa8b 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_atss_head.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_atss_head.py @@ -66,16 +66,7 @@ def forward_single(self, x, scale): centerness (Tensor): Centerness for a single scale level, the channel number is (N, num_anchors * 1, H, W). """ - cls_feat = x - reg_feat = x - for cls_conv in self.cls_convs: - cls_feat = cls_conv(cls_feat) - for reg_conv in self.reg_convs: - reg_feat = reg_conv(reg_feat) - cls_score = self.atss_cls(cls_feat) - # we just follow atss, not apply exp in bbox_pred - bbox_pred = scale(self.atss_reg(reg_feat)).float() - centerness = self.atss_centerness(reg_feat) + cls_score, bbox_pred, centerness = super().forward_single(x, scale) if cls_score.device.type == "hpu": # put further post-processing on cpu cls_score = cls_score.cpu() diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py index 89652c80f5a..247d8065797 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py @@ -33,20 +33,6 @@ def init_bbox_head(self, bbox_roi_extractor, bbox_head): bbox_head.type = "CustomConvFCBBoxHead" self.bbox_head = build_head(bbox_head) - - def _bbox_forward(self, x, rois): - """Box head forward function used in both training and testing.""" - # TODO: a more flexible way to decide which feature maps to use - bbox_feats = self.bbox_roi_extractor( - x[:self.bbox_roi_extractor.num_inputs], rois) - if self.with_shared_head: - bbox_feats = self.shared_head(bbox_feats) - cls_score, bbox_pred = self.bbox_head(bbox_feats) - - bbox_results = dict( - cls_score=cls_score, bbox_pred=bbox_pred, bbox_feats=bbox_feats) - return bbox_results - def _mask_forward(self, x, rois=None, pos_inds=None, bbox_feats=None): """Mask head forward function used in both training and testing.""" assert ((rois is not None) ^ @@ -68,61 +54,6 @@ def _mask_forward(self, x, rois=None, 
pos_inds=None, bbox_feats=None): mask_results = dict(mask_pred=mask_pred, mask_feats=mask_feats) return mask_results - def _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels, img_metas): - """Run forward function and calculate loss for box head in training.""" - rois = bbox2roi([res.bboxes for res in sampling_results]) - bbox_results = self._bbox_forward(x, rois) - - labels, label_weights, bbox_targets, bbox_weights, valid_label_mask = self.bbox_head.get_targets( - sampling_results, gt_bboxes, gt_labels, img_metas, self.train_cfg - ) - loss_bbox = self.bbox_head.loss( - bbox_results["cls_score"], - bbox_results["bbox_pred"], - rois, - labels, - label_weights, - bbox_targets, - bbox_weights, - valid_label_mask=valid_label_mask, - ) - bbox_results.update(loss_bbox=loss_bbox) - return bbox_results - - def _mask_forward_train(self, x, sampling_results, bbox_feats, gt_masks, - img_metas): - """Run forward function and calculate loss for mask head in - training.""" - if not self.share_roi_extractor: - pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results]) - mask_results = self._mask_forward(x, pos_rois) - else: - pos_inds = [] - device = bbox_feats.device - for res in sampling_results: - pos_inds.append( - torch.ones( - res.pos_bboxes.shape[0], - device=device, - dtype=torch.uint8)) - pos_inds.append( - torch.zeros( - res.neg_bboxes.shape[0], - device=device, - dtype=torch.uint8)) - pos_inds = torch.cat(pos_inds) - - mask_results = self._mask_forward( - x, pos_inds=pos_inds, bbox_feats=bbox_feats) - - mask_targets = self.mask_head.get_targets(sampling_results, gt_masks, - self.train_cfg) - pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results]) - loss_mask = self.mask_head.loss(mask_results['mask_pred'], - mask_targets, pos_labels) - - mask_results.update(loss_mask=loss_mask, mask_targets=mask_targets) - return mask_results @HEADS.register_module() class CustomConvFCBBoxHead(Shared2FCBBoxHead, CrossDatasetDetectorHead): @@ -194,52 +125,16 @@ def get_targets(self, sampling_results, gt_bboxes, gt_labels, img_metas, rcnn_tr valid_label_mask = torch.cat(valid_label_mask, 0) return labels, label_weights, bbox_targets, bbox_weights, valid_label_mask - def forward(self, x): + '''ConvFCBBoxHead forward''' # shared part - if self.num_shared_convs > 0: - for conv in self.shared_convs: - x = conv(x) - - if self.num_shared_fcs > 0: - if self.with_avg_pool: - x = self.avg_pool(x) - - x = x.flatten(1) - - for fc in self.shared_fcs: - x = self.relu(fc(x)) - # separate branches - x_cls = x - x_reg = x - - for conv in self.cls_convs: - x_cls = conv(x_cls) - if x_cls.dim() > 2: - if self.with_avg_pool: - x_cls = self.avg_pool(x_cls) - x_cls = x_cls.flatten(1) - for fc in self.cls_fcs: - x_cls = self.relu(fc(x_cls)) - - for conv in self.reg_convs: - x_reg = conv(x_reg) - if x_reg.dim() > 2: - if self.with_avg_pool: - x_reg = self.avg_pool(x_reg) - x_reg = x_reg.flatten(1) - for fc in self.reg_fcs: - x_reg = self.relu(fc(x_reg)) - - cls_score = self.fc_cls(x_cls) if self.with_cls else None - bbox_pred = self.fc_reg(x_reg) if self.with_reg else None + cls_score, bbox_pred = super().forward(self, x) if cls_score.device.type == 'hpu': cls_score = cls_score.cpu() bbox_pred = bbox_pred.cpu() return cls_score, bbox_pred - @force_fp32(apply_to=("cls_score", "bbox_pred")) def loss( self, diff --git a/src/otx/algorithms/detection/configs/instance_segmentation/efficientnetb2b_maskrcnn/template.yaml 
b/src/otx/algorithms/detection/configs/instance_segmentation/efficientnetb2b_maskrcnn/template.yaml index 272a648c551..82a27946ebb 100644 --- a/src/otx/algorithms/detection/configs/instance_segmentation/efficientnetb2b_maskrcnn/template.yaml +++ b/src/otx/algorithms/detection/configs/instance_segmentation/efficientnetb2b_maskrcnn/template.yaml @@ -36,7 +36,7 @@ hyper_parameters: learning_rate_warmup_iters: default_value: 100 num_iters: - default_value: 100 + default_value: 5 pot_parameters: stat_requests_number: default_value: 1 diff --git a/src/otx/recipes/stages/instance-segmentation/incremental.py b/src/otx/recipes/stages/instance-segmentation/incremental.py index d6c17f4767f..8e472d5c986 100644 --- a/src/otx/recipes/stages/instance-segmentation/incremental.py +++ b/src/otx/recipes/stages/instance-segmentation/incremental.py @@ -20,5 +20,5 @@ adaptive_validation_interval = dict( max_interval=5, enable_adaptive_interval_hook=False, - enable_eval_before_run=False, + enable_eval_before_run=True, ) diff --git a/src/otx/recipes/stages/instance-segmentation/train.py b/src/otx/recipes/stages/instance-segmentation/train.py index a9bd72da3e0..b866aed4b94 100644 --- a/src/otx/recipes/stages/instance-segmentation/train.py +++ b/src/otx/recipes/stages/instance-segmentation/train.py @@ -39,7 +39,7 @@ dict( type="AdaptiveTrainSchedulingHook", enable_adaptive_interval_hook=False, - enable_eval_before_run=False, + enable_eval_before_run=True, ), dict(type="LoggerReplaceHook"), dict( From 4d4cb91784f35c3d1b872aac632a76b34ef6d59b Mon Sep 17 00:00:00 2001 From: kprokofi Date: Wed, 8 Nov 2023 16:27:00 +0000 Subject: [PATCH 11/16] clean code 2 --- .../mmdet/models/heads/custom_roi_head.py | 44 +++++++++++-------- .../mmdet/models/heads/custom_rpn_head.py | 6 +-- .../mmdet/models/heads/custom_yolox_head.py | 15 ++----- .../detection/cspdarknet_yolox_x/model.py | 2 +- .../cspdarknet_yolox_x/template.yaml | 6 +-- .../instance_segmentation/configuration.yaml | 4 +- .../efficientnetb2b_maskrcnn/template.yaml | 2 +- .../maskrcnn_swin_t/model.py | 1 - .../resnet50_maskrcnn/template.yaml | 4 +- .../recipes/stages/detection/incremental.py | 2 +- .../instance-segmentation/incremental.py | 4 +- .../stages/instance-segmentation/train.py | 4 +- 12 files changed, 45 insertions(+), 49 deletions(-) diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py index 247d8065797..45da297ca82 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py @@ -33,25 +33,33 @@ def init_bbox_head(self, bbox_roi_extractor, bbox_head): bbox_head.type = "CustomConvFCBBoxHead" self.bbox_head = build_head(bbox_head) + def _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels, img_metas): + """Run forward function and calculate loss for box head in training.""" + rois = bbox2roi([res.bboxes for res in sampling_results]) + bbox_results = self._bbox_forward(x, rois) + + labels, label_weights, bbox_targets, bbox_weights, valid_label_mask = self.bbox_head.get_targets( + sampling_results, gt_bboxes, gt_labels, img_metas, self.train_cfg + ) + loss_bbox = self.bbox_head.loss( + bbox_results["cls_score"], + bbox_results["bbox_pred"], + rois, + labels, + label_weights, + bbox_targets, + bbox_weights, + valid_label_mask=valid_label_mask, + ) + bbox_results.update(loss_bbox=loss_bbox) + return bbox_results + def 
_mask_forward(self, x, rois=None, pos_inds=None, bbox_feats=None): """Mask head forward function used in both training and testing.""" - assert ((rois is not None) ^ - (pos_inds is not None and bbox_feats is not None)) - if rois is not None: - mask_feats = self.mask_roi_extractor( - x[:self.mask_roi_extractor.num_inputs], rois) - if self.with_shared_head: - mask_feats = self.shared_head(mask_feats) - else: - assert bbox_feats is not None - mask_feats = bbox_feats[pos_inds] - - mask_pred = self.mask_head(mask_feats) - if mask_pred.device.type == "hpu": - mask_pred = mask_pred.cpu() - mask_feats = mask_feats.cpu() - - mask_results = dict(mask_pred=mask_pred, mask_feats=mask_feats) + mask_results = super()._mask_forward(x, rois, pos_inds, bbox_feats) + if mask_results["mask_pred"].device.type == "hpu": + mask_results["mask_pred"] = mask_results["mask_pred"].cpu() + mask_results["mask_feats"] = mask_results["mask_feats"].cpu() return mask_results @@ -128,7 +136,7 @@ def get_targets(self, sampling_results, gt_bboxes, gt_labels, img_metas, rcnn_tr def forward(self, x): '''ConvFCBBoxHead forward''' # shared part - cls_score, bbox_pred = super().forward(self, x) + cls_score, bbox_pred = super().forward(x) if cls_score.device.type == 'hpu': cls_score = cls_score.cpu() bbox_pred = bbox_pred.cpu() diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py index 84d23757cca..6d7614bb844 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py @@ -15,12 +15,8 @@ class CustomRPNHead(RPNHead): def forward_single(self, x): """Forward feature map of a single scale level.""" - x = self.rpn_conv(x) - x = F.relu(x, inplace=False) - rpn_cls_score = self.rpn_cls(x) - rpn_bbox_pred = self.rpn_reg(x) + rpn_cls_score, rpn_bbox_pred = super().forward_single(x) if rpn_cls_score.device.type == "hpu": rpn_cls_score = rpn_cls_score.cpu() rpn_bbox_pred = rpn_bbox_pred.cpu() - return rpn_cls_score, rpn_bbox_pred diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py index 69fb668e2f3..e47d891ea48 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py @@ -107,20 +107,13 @@ def loss(self, cls_scores, bbox_preds, objectnesses, gt_bboxes, gt_labels, img_m def forward_single(self, x, cls_convs, reg_convs, conv_cls, conv_reg, conv_obj): """Forward feature of a single scale level.""" - - cls_feat = cls_convs(x) - reg_feat = reg_convs(x) - - cls_score = conv_cls(cls_feat) - bbox_pred = conv_reg(reg_feat) - objectness = conv_obj(reg_feat) - + cls_score, bbox_pred, objectness = super().forward_single(x, cls_convs, reg_convs, conv_cls, conv_reg, + conv_obj) if cls_score.device.type == "hpu": # put on cpu for further post-processing cls_score = cls_score.cpu() bbox_pred = bbox_pred.cpu() objectness = objectness.cpu() - return cls_score, bbox_pred, objectness @@ -264,9 +257,7 @@ def _get_target_single(self, cls_preds, objectness, priors, decoded_bboxes, gt_b num_priors = priors.size(0) num_gts = gt_labels.size(0) gt_bboxes = gt_bboxes.to(decoded_bboxes.dtype) - if "hpu" in gt_bboxes.device: - gt_bboxes = gt_bboxes.cpu() - gt_labels = gt_labels.cpu() + # No target if num_gts == 0: 
cls_target = cls_preds.new_zeros((0, self.num_classes)) diff --git a/src/otx/algorithms/detection/configs/detection/cspdarknet_yolox_x/model.py b/src/otx/algorithms/detection/configs/detection/cspdarknet_yolox_x/model.py index e734996e4b1..857021810d1 100644 --- a/src/otx/algorithms/detection/configs/detection/cspdarknet_yolox_x/model.py +++ b/src/otx/algorithms/detection/configs/detection/cspdarknet_yolox_x/model.py @@ -20,5 +20,5 @@ load_from = "https://download.openmmlab.com/mmdetection/v2.0/yolox\ /yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth" -fp16 = None +fp16 = dict(loss_scale=512.0) ignore = False diff --git a/src/otx/algorithms/detection/configs/detection/cspdarknet_yolox_x/template.yaml b/src/otx/algorithms/detection/configs/detection/cspdarknet_yolox_x/template.yaml index ca1d67ef754..50e07835a96 100644 --- a/src/otx/algorithms/detection/configs/detection/cspdarknet_yolox_x/template.yaml +++ b/src/otx/algorithms/detection/configs/detection/cspdarknet_yolox_x/template.yaml @@ -26,17 +26,17 @@ hyper_parameters: parameter_overrides: learning_parameters: batch_size: - default_value: 16 + default_value: 4 auto_hpo_state: POSSIBLE inference_batch_size: - default_value: 16 + default_value: 4 learning_rate: default_value: 0.001 auto_hpo_state: POSSIBLE learning_rate_warmup_iters: default_value: 3 num_iters: - default_value: 20 + default_value: 200 nncf_optimization: enable_quantization: default_value: true diff --git a/src/otx/algorithms/detection/configs/instance_segmentation/configuration.yaml b/src/otx/algorithms/detection/configs/instance_segmentation/configuration.yaml index c49981e2c66..f0672ae5ff8 100644 --- a/src/otx/algorithms/detection/configs/instance_segmentation/configuration.yaml +++ b/src/otx/algorithms/detection/configs/instance_segmentation/configuration.yaml @@ -194,7 +194,7 @@ learning_parameters: warning: This is applied exclusively when early stopping is enabled. use_adaptive_interval: affects_outcome_of: TRAINING - default_value: false + default_value: true description: Depending on the size of iteration per epoch, adaptively update the validation interval and related values. editable: true header: Use adaptive validation interval @@ -208,7 +208,7 @@ learning_parameters: warning: This will automatically control the patience and interval when early stopping is enabled. auto_adapt_batch_size: affects_outcome_of: TRAINING - default_value: None + default_value: Safe description: Safe => Prevent GPU out of memory. Full => Find a batch size using most of GPU memory. 
editable: true enum_name: BatchSizeAdaptType diff --git a/src/otx/algorithms/detection/configs/instance_segmentation/efficientnetb2b_maskrcnn/template.yaml b/src/otx/algorithms/detection/configs/instance_segmentation/efficientnetb2b_maskrcnn/template.yaml index 82a27946ebb..272a648c551 100644 --- a/src/otx/algorithms/detection/configs/instance_segmentation/efficientnetb2b_maskrcnn/template.yaml +++ b/src/otx/algorithms/detection/configs/instance_segmentation/efficientnetb2b_maskrcnn/template.yaml @@ -36,7 +36,7 @@ hyper_parameters: learning_rate_warmup_iters: default_value: 100 num_iters: - default_value: 5 + default_value: 100 pot_parameters: stat_requests_number: default_value: 1 diff --git a/src/otx/algorithms/detection/configs/instance_segmentation/maskrcnn_swin_t/model.py b/src/otx/algorithms/detection/configs/instance_segmentation/maskrcnn_swin_t/model.py index d6f95c1d366..203470d2fac 100644 --- a/src/otx/algorithms/detection/configs/instance_segmentation/maskrcnn_swin_t/model.py +++ b/src/otx/algorithms/detection/configs/instance_segmentation/maskrcnn_swin_t/model.py @@ -134,7 +134,6 @@ ) evaluation = dict(interval=1, metric="mAP", save_best="mAP", iou_thr=[0.5]) - optimizer = dict( _delete_=True, type="AdamW", diff --git a/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/template.yaml b/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/template.yaml index a77d5a22c47..17a74b1c25e 100644 --- a/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/template.yaml +++ b/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/template.yaml @@ -26,10 +26,10 @@ hyper_parameters: parameter_overrides: learning_parameters: batch_size: - default_value: 16 + default_value: 4 auto_hpo_state: POSSIBLE inference_batch_size: - default_value: 16 + default_value: 1 learning_rate: default_value: 0.007 auto_hpo_state: POSSIBLE diff --git a/src/otx/recipes/stages/detection/incremental.py b/src/otx/recipes/stages/detection/incremental.py index 692a7dbee7c..9ddd2e28e55 100644 --- a/src/otx/recipes/stages/detection/incremental.py +++ b/src/otx/recipes/stages/detection/incremental.py @@ -43,6 +43,6 @@ ignore = True adaptive_validation_interval = dict( max_interval=5, - enable_adaptive_interval_hook=False, + enable_adaptive_interval_hook=True, enable_eval_before_run=True, ) diff --git a/src/otx/recipes/stages/instance-segmentation/incremental.py b/src/otx/recipes/stages/instance-segmentation/incremental.py index 8e472d5c986..93cda5428e7 100644 --- a/src/otx/recipes/stages/instance-segmentation/incremental.py +++ b/src/otx/recipes/stages/instance-segmentation/incremental.py @@ -3,7 +3,7 @@ task = "instance-segmentation" evaluation = dict( - interval=100, metric="mAP", save_best="mAP", iou_thr=[0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + interval=1, metric="mAP", save_best="mAP", iou_thr=[0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] ) task_adapt = dict( @@ -19,6 +19,6 @@ ignore = True adaptive_validation_interval = dict( max_interval=5, - enable_adaptive_interval_hook=False, + enable_adaptive_interval_hook=True, enable_eval_before_run=True, ) diff --git a/src/otx/recipes/stages/instance-segmentation/train.py b/src/otx/recipes/stages/instance-segmentation/train.py index b866aed4b94..12e2110c7f8 100644 --- a/src/otx/recipes/stages/instance-segmentation/train.py +++ b/src/otx/recipes/stages/instance-segmentation/train.py @@ -1,7 +1,7 @@ _base_ = [ "../_base_/default.py", 
"../_base_/logs/tensorboard_logger.py", - "../_base_/optimizers/adam.py", + "../_base_/optimizers/sgd.py", "../_base_/runners/epoch_runner_cancel.py", "../_base_/schedules/plateau.py", ] @@ -9,6 +9,7 @@ optimizer = dict( type="SGD", lr=0.001, + momentum=0.9, weight_decay=0.0001, ) @@ -26,6 +27,7 @@ evaluation = dict(interval=1, metric="mAP", save_best="mAP") early_stop_metric = "mAP" + custom_hooks = [ dict( type="LazyEarlyStoppingHook", From d33f48df2924067cabeccfe389a19a5350b418d1 Mon Sep 17 00:00:00 2001 From: kprokofi Date: Wed, 8 Nov 2023 16:40:16 +0000 Subject: [PATCH 12/16] fix pre-commit --- .../algorithms/common/adapters/mmcv/configurer.py | 1 + .../detection/adapters/mmdet/apis/train.py | 15 +++++++++------ .../adapters/mmdet/evaluation/evaluator.py | 14 +++++++++----- .../detectors/custom_single_stage_detector.py | 1 + .../adapters/mmdet/models/heads/__init__.py | 2 +- .../mmdet/models/heads/custom_roi_head.py | 4 ++-- .../mmdet/models/heads/custom_rpn_head.py | 7 +++++-- .../mmdet/models/heads/custom_ssd_head.py | 4 +--- .../mmdet/models/heads/custom_yolox_head.py | 7 ++----- .../configs/detection/configuration.yaml | 2 +- .../configs/detection/mobilenetv2_ssd/model.py | 1 - .../resnet50_maskrcnn/model.py | 2 +- .../recipes/stages/instance-segmentation/train.py | 1 - 13 files changed, 33 insertions(+), 28 deletions(-) diff --git a/src/otx/algorithms/common/adapters/mmcv/configurer.py b/src/otx/algorithms/common/adapters/mmcv/configurer.py index bfdf3af4b71..83d43192ed7 100644 --- a/src/otx/algorithms/common/adapters/mmcv/configurer.py +++ b/src/otx/algorithms/common/adapters/mmcv/configurer.py @@ -184,6 +184,7 @@ def configure_device(self, cfg): elif is_xpu_available(): try: import intel_extension_for_pytorch as ipex # noqa: F401 + cfg.device = "xpu" except ModuleNotFoundError: cfg.device = "cpu" diff --git a/src/otx/algorithms/detection/adapters/mmdet/apis/train.py b/src/otx/algorithms/detection/adapters/mmdet/apis/train.py index ef13e8d84b3..98ac05d28f5 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/apis/train.py +++ b/src/otx/algorithms/detection/adapters/mmdet/apis/train.py @@ -24,10 +24,9 @@ from torchvision.ops import nms as tv_nms from torchvision.ops import roi_align as tv_roi_align -from otx.algorithms.common.adapters.mmcv.utils import XPUDataParallel, HPUDataParallel +from otx.algorithms.common.adapters.mmcv.utils import HPUDataParallel, XPUDataParallel from otx.algorithms.common.adapters.mmcv.utils.hpu_optimizers import HABANA_OPTIMIZERS - ext_module = ext_loader.load_ext("_ext", ["nms", "softnms", "nms_match", "nms_rotated", "nms_quadri"]) dp_factory["xpu"] = XPUDataParallel dp_factory["hpu"] = HPUDataParallel @@ -123,13 +122,15 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids, enable_autocast=bool(fp16_cfg)) model.to(f"xpu:{cfg.gpu_ids[0]}") elif cfg.device == "hpu": - from habana_frameworks.torch.utils.library_loader import load_habana_module import habana_frameworks.torch.core as htcore + from habana_frameworks.torch.utils.library_loader import load_habana_module + load_habana_module() os.environ["PT_HPU_LAZY_MODE"] = "1" assert len(cfg.gpu_ids) == 1 - model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids, dim=0, - enable_autocast=bool(fp16_cfg), put_gt_on_device=False) + model = build_dp( + model, cfg.device, device_ids=cfg.gpu_ids, dim=0, enable_autocast=bool(fp16_cfg), put_gt_on_device=False + ) model.to(model.src_device_obj) htcore.mark_step() 
model.zero_grad() @@ -221,8 +222,9 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow) + def patch_optimizer(cfg_optim): - "Patch optimizer for OD and IS" + """Patch optimizer for OD and IS.""" if cfg_optim["type"] == "SGD": return cfg_optim @@ -232,6 +234,7 @@ def patch_optimizer(cfg_optim): del cfg_optim["betas"] return cfg_optim + def monkey_patched_xpu_nms(ctx, bboxes, scores, iou_threshold, offset, score_threshold, max_num): """Runs MMCVs NMS with torchvision.nms, or forces NMS from MMCV to run on CPU.""" is_filtering_by_score = score_threshold > 0 diff --git a/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py b/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py index 715c260d371..f4cad3df5bb 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py +++ b/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py @@ -15,18 +15,18 @@ # and limitations under the License. import multiprocessing as mp -from typing import Dict, List, Tuple, Union import time +from typing import Dict, List, Tuple, Union import mmcv import numpy as np import pycocotools.mask as mask_util from mmcv.utils import print_log from mmdet.core import BitmapMasks, PolygonMasks, eval_map +from mmdet.core.evaluation import mean_ap from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps from mmdet.core.evaluation.class_names import get_classes from mmdet.core.evaluation.mean_ap import average_precision -from mmdet.core.evaluation import mean_ap from terminaltables import AsciiTable from otx.api.entities.label import Domain @@ -99,16 +99,20 @@ def print_map_summary( # pylint: disable=too-many-locals,too-many-branches num_gts[i, j], results[j]["num_dets"], f"{recalls[i, j]:.3f}", - f"{aps[i, j]:.3f}" + f"{aps[i, j]:.3f}", ] if segmentation: row_data.append(f"{mious[i, j]:.3f}") table_data.append(row_data) - table_ = ["mAP", "", "", "", f"{mean_ap[i]:.3f}", f"{np.mean(mious[i]):.3f}"] if segmentation else ["mAP", "", "", "", f"{mean_ap[i]:.3f}"] + table_ = ( + ["mAP", "", "", "", f"{mean_ap[i]:.3f}", f"{np.mean(mious[i]):.3f}"] + if segmentation + else ["mAP", "", "", "", f"{mean_ap[i]:.3f}"] + ) table_data.append(table_) table = AsciiTable(table_data) table.inner_footing_row_border = True - time.sleep(0.1) # prevent segmentation fault + time.sleep(0.1) # prevent segmentation fault print_log("\n" + table.table, logger=logger) diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py b/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py index e5587270545..a8e926cae5d 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py @@ -8,6 +8,7 @@ import torch from mmdet.models.builder import DETECTORS from mmdet.models.detectors.single_stage import SingleStageDetector + from otx.algorithms.common.adapters.mmcv.hooks.recording_forward_hook import ( FeatureVectorHook, ) diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py index a0a410f3035..e705d18bdc8 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py @@ -10,11 +10,11 
@@ from .custom_fcn_mask_head import CustomFCNMaskHead from .custom_retina_head import CustomRetinaHead from .custom_roi_head import CustomRoIHead +from .custom_rpn_head import CustomRPNHead from .custom_ssd_head import CustomSSDHead from .custom_vfnet_head import CustomVFNetHead from .custom_yolox_head import CustomYOLOXHead from .detr_head import DETRHeadExtension -from .custom_rpn_head import CustomRPNHead __all__ = [ "CrossDatasetDetectorHead", diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py index 45da297ca82..d8e546a5f91 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py @@ -134,10 +134,10 @@ def get_targets(self, sampling_results, gt_bboxes, gt_labels, img_metas, rcnn_tr return labels, label_weights, bbox_targets, bbox_weights, valid_label_mask def forward(self, x): - '''ConvFCBBoxHead forward''' + """ConvFCBBoxHead forward.""" # shared part cls_score, bbox_pred = super().forward(x) - if cls_score.device.type == 'hpu': + if cls_score.device.type == "hpu": cls_score = cls_score.cpu() bbox_pred = bbox_pred.cpu() diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py index 6d7614bb844..4f73b1b9511 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py @@ -1,6 +1,9 @@ -import torch.nn.functional as F -from mmdet.models.dense_heads import RPNHead +"""Custom ROI head for OTX template.""" +# Copyright (C) 2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# from mmdet.models.builder import HEADS +from mmdet.models.dense_heads import RPNHead @HEADS.register_module() diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_ssd_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_ssd_head.py index 1ab2e6a8bbe..7aebbcb3173 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_ssd_head.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_ssd_head.py @@ -13,7 +13,6 @@ from mmdet.models.dense_heads.ssd_head import SSDHead from mmdet.models.losses import smooth_l1_loss from torch import nn -import time from otx.algorithms.detection.adapters.mmdet.models.heads.cross_dataset_detector_head import TrackingLossDynamicsMixIn from otx.algorithms.detection.adapters.mmdet.models.loss_dyns import ( @@ -100,8 +99,7 @@ def forward(self, feats): """ cls_scores = [] bbox_preds = [] - for feat, reg_conv, cls_conv in zip(feats, self.reg_convs, - self.cls_convs): + for feat, reg_conv, cls_conv in zip(feats, self.reg_convs, self.cls_convs): cls_out = cls_conv(feat) reg_out = reg_conv(feat) if cls_out.device.type == "hpu": diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py index e47d891ea48..8b12ae88dd2 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py @@ -43,7 +43,6 @@ def loss(self, cls_scores, bbox_preds, objectnesses, gt_bboxes, gt_labels, img_m gt_bboxes_ignore (None | list[Tensor]): specify which bounding boxes can be 
ignored when computing the loss. """ - num_imgs = len(img_metas) featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] mlvl_priors = self.prior_generator.grid_priors( @@ -104,11 +103,9 @@ def loss(self, cls_scores, bbox_preds, objectnesses, gt_bboxes, gt_labels, img_m return loss_dict - def forward_single(self, x, cls_convs, reg_convs, conv_cls, conv_reg, - conv_obj): + def forward_single(self, x, cls_convs, reg_convs, conv_cls, conv_reg, conv_obj): """Forward feature of a single scale level.""" - cls_score, bbox_pred, objectness = super().forward_single(x, cls_convs, reg_convs, conv_cls, conv_reg, - conv_obj) + cls_score, bbox_pred, objectness = super().forward_single(x, cls_convs, reg_convs, conv_cls, conv_reg, conv_obj) if cls_score.device.type == "hpu": # put on cpu for further post-processing cls_score = cls_score.cpu() diff --git a/src/otx/algorithms/detection/configs/detection/configuration.yaml b/src/otx/algorithms/detection/configs/detection/configuration.yaml index 7fb866e16df..d36b0d941bc 100644 --- a/src/otx/algorithms/detection/configs/detection/configuration.yaml +++ b/src/otx/algorithms/detection/configs/detection/configuration.yaml @@ -129,7 +129,7 @@ learning_parameters: warning: null enable_early_stopping: affects_outcome_of: TRAINING - default_value: false + default_value: true description: Early exit from training when validation accuracy isn't changed or decreased for several epochs. editable: true header: Enable early stopping of the training diff --git a/src/otx/algorithms/detection/configs/detection/mobilenetv2_ssd/model.py b/src/otx/algorithms/detection/configs/detection/mobilenetv2_ssd/model.py index bc4eff90aa2..45847b0b80c 100644 --- a/src/otx/algorithms/detection/configs/detection/mobilenetv2_ssd/model.py +++ b/src/otx/algorithms/detection/configs/detection/mobilenetv2_ssd/model.py @@ -96,5 +96,4 @@ /models/object_detection/v2/mobilenet_v2-2s_ssd-992x736.pth" fp16 = dict(loss_scale=512.0) -# fp16 = None ignore = False diff --git a/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/model.py b/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/model.py index d0e51ef26c9..d8918edc33f 100644 --- a/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/model.py +++ b/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/model.py @@ -159,5 +159,5 @@ v2.0/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco/\ mask_rcnn_r50_fpn_mstrain-poly_3x_coco_20210524_201154-21b550bb.pth" -evaluation = dict(interval=100, metric="mAP", save_best="mAP", iou_thr=[0.5]) +evaluation = dict(interval=1, metric="mAP", save_best="mAP", iou_thr=[0.5]) ignore = True diff --git a/src/otx/recipes/stages/instance-segmentation/train.py b/src/otx/recipes/stages/instance-segmentation/train.py index 12e2110c7f8..0ac963fa94d 100644 --- a/src/otx/recipes/stages/instance-segmentation/train.py +++ b/src/otx/recipes/stages/instance-segmentation/train.py @@ -7,7 +7,6 @@ ] optimizer = dict( - type="SGD", lr=0.001, momentum=0.9, weight_decay=0.0001, From 6eaf641d2b69e504e2379977f9488692b3f3646e Mon Sep 17 00:00:00 2001 From: kprokofi Date: Wed, 8 Nov 2023 16:43:01 +0000 Subject: [PATCH 13/16] minor --- .../detection/adapters/mmdet/models/heads/custom_rpn_head.py | 2 +- .../detection/adapters/mmdet/models/heads/custom_yolox_head.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py 
b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py index 4f73b1b9511..b5bb4184fe3 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py @@ -1,4 +1,4 @@ -"""Custom ROI head for OTX template.""" +"""Custom RPN head for OTX template.""" # Copyright (C) 2022 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py index 8b12ae88dd2..161d692e4f4 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py @@ -229,6 +229,7 @@ def loss(self, cls_scores, bbox_preds, objectnesses, gt_bboxes, gt_labels, img_m if self.use_l1: loss_l1 = self.loss_l1(flatten_bbox_preds.view(-1, 4)[pos_masks], l1_targets) / num_total_samples loss_dict.update(loss_l1=loss_l1) + return loss_dict @torch.no_grad() From ecf1b43044faf952e930723fa7d6d3404821d18e Mon Sep 17 00:00:00 2001 From: kprokofi Date: Wed, 8 Nov 2023 17:10:10 +0000 Subject: [PATCH 14/16] change cast of bf16 --- .../common/adapters/mmcv/hooks/recording_forward_hook.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/otx/algorithms/common/adapters/mmcv/hooks/recording_forward_hook.py b/src/otx/algorithms/common/adapters/mmcv/hooks/recording_forward_hook.py index d4df8bbbc22..a3b2698babb 100644 --- a/src/otx/algorithms/common/adapters/mmcv/hooks/recording_forward_hook.py +++ b/src/otx/algorithms/common/adapters/mmcv/hooks/recording_forward_hook.py @@ -23,6 +23,7 @@ from torch.nn import LayerNorm from otx.algorithms.classification import MMCLS_AVAILABLE +from otx.algorithms.common.utils.utils import cast_bf16_to_fp32 if MMCLS_AVAILABLE: from mmcls.models.necks.gap import GlobalAveragePooling @@ -74,9 +75,7 @@ def _recording_forward( ): # pylint: disable=unused-argument tensors = self.func(output) if isinstance(tensors, torch.Tensor): - if tensors.dtype == torch.bfloat16: - tensors = tensors.to(torch.float32) - tensors_np = tensors.detach().cpu().numpy() + tensors_np = cast_bf16_to_fp32(tensors).detach().cpu().numpy() elif isinstance(tensors, np.ndarray): tensors_np = tensors else: From 496c92f08881bb8a5ae54b1d61289a0ad8d734ac Mon Sep 17 00:00:00 2001 From: kprokofi Date: Thu, 9 Nov 2023 14:23:03 +0000 Subject: [PATCH 15/16] reply comments --- .../utils/_builder_build_data_parallel.py | 2 + src/otx/algorithms/common/utils/__init__.py | 2 + .../detection/adapters/mmdet/apis/train.py | 47 +++++-------------- .../adapters/mmdet/evaluation/evaluator.py | 5 +- .../detection/adapters/mmdet/task.py | 10 ++-- 5 files changed, 23 insertions(+), 43 deletions(-) diff --git a/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py b/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py index d20cd540ed1..226e5e8cc25 100644 --- a/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py +++ b/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py @@ -154,6 +154,8 @@ def scatter(self, inputs, kwargs, device_ids): for val in x: if isinstance(val, dict): for k in val: + # don't put annotations on the HPU to proceed + # post-processing on the CPU if not self.put_gt_on_device and k.startswith("gt_"): continue if isinstance(val[k], 
torch.Tensor): diff --git a/src/otx/algorithms/common/utils/__init__.py b/src/otx/algorithms/common/utils/__init__.py index 6395bd6e60d..5e8b55a0171 100644 --- a/src/otx/algorithms/common/utils/__init__.py +++ b/src/otx/algorithms/common/utils/__init__.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions # and limitations under the License. +import os from .callback import ( InferenceProgressCallback, OptimizationProgressCallback, @@ -59,4 +60,5 @@ if is_hpu_available(): + os.environ["PT_HPU_LAZY_MODE"] = "1" import habana_frameworks.torch.gpu_migration # noqa: F401 diff --git a/src/otx/algorithms/detection/adapters/mmdet/apis/train.py b/src/otx/algorithms/detection/adapters/mmdet/apis/train.py index 98ac05d28f5..4565631880d 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/apis/train.py +++ b/src/otx/algorithms/detection/adapters/mmdet/apis/train.py @@ -122,31 +122,26 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids, enable_autocast=bool(fp16_cfg)) model.to(f"xpu:{cfg.gpu_ids[0]}") elif cfg.device == "hpu": - import habana_frameworks.torch.core as htcore - from habana_frameworks.torch.utils.library_loader import load_habana_module - - load_habana_module() - os.environ["PT_HPU_LAZY_MODE"] = "1" - assert len(cfg.gpu_ids) == 1 model = build_dp( model, cfg.device, device_ids=cfg.gpu_ids, dim=0, enable_autocast=bool(fp16_cfg), put_gt_on_device=False ) - model.to(model.src_device_obj) - htcore.mark_step() - model.zero_grad() + # patch optimizer + if (new_type := "Fused" + cfg.optimizer.get("type", "SGD")) in HABANA_OPTIMIZERS: + cfg.optimizer["type"] = new_type else: model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids) # build optimizer auto_scale_lr(cfg, distributed, logger) - if cfg.device == "hpu": - cfg.optimizer = patch_optimizer(cfg.optimizer) + + if cfg.device in ["hpu", "xpu"]: + # dynamic patch for nms and roi_align + NMSop.forward = monkey_patched_nms + RoIAlign.forward = monkey_patched_roi_align + optimizer = build_optimizer(model, cfg.optimizer) if cfg.device == "xpu": - # dynamic patch for nms and roi_align - NMSop.forward = monkey_patched_xpu_nms - RoIAlign.forward = monkey_patched_xpu_roi_align if fp16_cfg is not None: dtype = torch.bfloat16 else: @@ -154,14 +149,6 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times model.train() model, optimizer = torch.xpu.optimize(model, optimizer=optimizer, dtype=dtype) - if cfg.device == "hpu": - NMSop.forward = monkey_patched_xpu_nms - RoIAlign.forward = monkey_patched_xpu_roi_align - # build runner - if cfg.device == "hpu": - if (new_type := "Fused" + cfg.optimizer.get("type", "SGD")) in HABANA_OPTIMIZERS: - cfg.optimizer["type"] = new_type - runner = build_runner( cfg.runner, default_args=dict(model=model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) ) @@ -223,19 +210,7 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times runner.run(data_loaders, cfg.workflow) -def patch_optimizer(cfg_optim): - """Patch optimizer for OD and IS.""" - if cfg_optim["type"] == "SGD": - return cfg_optim - - # Only SGD for OD and IS supported by now on HPU - cfg_optim["type"] = "SGD" - if "betas" in cfg_optim: - del cfg_optim["betas"] - return cfg_optim - - -def monkey_patched_xpu_nms(ctx, bboxes, scores, iou_threshold, offset, score_threshold, max_num): +def monkey_patched_nms(ctx, bboxes, scores, iou_threshold, offset, 
score_threshold, max_num): """Runs MMCVs NMS with torchvision.nms, or forces NMS from MMCV to run on CPU.""" is_filtering_by_score = score_threshold > 0 if is_filtering_by_score: @@ -265,7 +240,7 @@ def monkey_patched_xpu_nms(ctx, bboxes, scores, iou_threshold, offset, score_thr return inds -def monkey_patched_xpu_roi_align(self, input, rois): +def monkey_patched_roi_align(self, input, rois): """Replaces MMCVs roi align with the one from torchvision. Args: diff --git a/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py b/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py index f4cad3df5bb..96b42d05a2b 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py +++ b/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py @@ -31,7 +31,7 @@ from otx.api.entities.label import Domain from otx.api.utils.time_utils import timeit - +from otx.algorithms.common.utils.utils import is_hpu_available def print_map_summary( # pylint: disable=too-many-locals,too-many-branches mean_ap, results, dataset=None, scale_ranges=None, logger=None @@ -112,7 +112,8 @@ def print_map_summary( # pylint: disable=too-many-locals,too-many-branches table_data.append(table_) table = AsciiTable(table_data) table.inner_footing_row_border = True - time.sleep(0.1) # prevent segmentation fault + if is_hpu_available(): + time.sleep(0.1) # prevent segmentation fault print_log("\n" + table.table, logger=logger) diff --git a/src/otx/algorithms/detection/adapters/mmdet/task.py b/src/otx/algorithms/detection/adapters/mmdet/task.py index 3b8040408be..53c00ceaf32 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/task.py +++ b/src/otx/algorithms/detection/adapters/mmdet/task.py @@ -44,8 +44,8 @@ from otx.algorithms.common.utils.data import get_dataset from otx.algorithms.common.utils.logger import get_logger from otx.algorithms.detection.adapters.mmdet.apis.train import ( - monkey_patched_xpu_nms, - monkey_patched_xpu_roi_align, + monkey_patched_nms, + monkey_patched_roi_align, train_detector, ) from otx.algorithms.detection.adapters.mmdet.configurer import ( @@ -348,9 +348,9 @@ def _infer_model( else: target_classes = mm_dataset.CLASSES - if cfg.device == "xpu": - NMSop.forward = monkey_patched_xpu_nms - RoIAlign.forward = monkey_patched_xpu_roi_align + if cfg.device in ["xpu", "hpu"]: + NMSop.forward = monkey_patched_nms + RoIAlign.forward = monkey_patched_roi_align # Model model = self.build_model(cfg, fp16=cfg.get("fp16", False)) From e85d681efb2eace8a8b6a14bd8e0a7d70d96236a Mon Sep 17 00:00:00 2001 From: eunwoosh Date: Fri, 10 Nov 2023 09:39:40 +0900 Subject: [PATCH 16/16] align with pre-commit --- src/otx/algorithms/common/utils/__init__.py | 1 + .../detection/adapters/mmdet/evaluation/evaluator.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/otx/algorithms/common/utils/__init__.py b/src/otx/algorithms/common/utils/__init__.py index 5e8b55a0171..80372c59b4b 100644 --- a/src/otx/algorithms/common/utils/__init__.py +++ b/src/otx/algorithms/common/utils/__init__.py @@ -15,6 +15,7 @@ # and limitations under the License. 
import os + from .callback import ( InferenceProgressCallback, OptimizationProgressCallback, diff --git a/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py b/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py index 96b42d05a2b..36bda12206f 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py +++ b/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py @@ -29,9 +29,10 @@ from mmdet.core.evaluation.mean_ap import average_precision from terminaltables import AsciiTable +from otx.algorithms.common.utils.utils import is_hpu_available from otx.api.entities.label import Domain from otx.api.utils.time_utils import timeit -from otx.algorithms.common.utils.utils import is_hpu_available + def print_map_summary( # pylint: disable=too-many-locals,too-many-branches mean_ap, results, dataset=None, scale_ranges=None, logger=None
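
The hunks above import `is_hpu_available` and `cast_bf16_to_fp32` from `otx.algorithms.common.utils.utils`, but the series does not show their definitions in this excerpt. A minimal sketch of what such helpers could look like, using only standard `importlib` and `torch` APIs (an illustrative assumption, not the actual OTX implementation):

    # Hypothetical sketch -- not taken from the patch itself.
    import importlib.util

    import torch


    def is_hpu_available() -> bool:
        """Return True if the Habana PyTorch bridge appears to be installed."""
        return importlib.util.find_spec("habana_frameworks") is not None


    def cast_bf16_to_fp32(tensor: torch.Tensor) -> torch.Tensor:
        """Cast bfloat16 tensors to float32 so they can be moved to numpy."""
        if tensor.dtype == torch.bfloat16:
            tensor = tensor.to(torch.float32)
        return tensor

A helper of this shape is what the recording-forward-hook change relies on: bfloat16 activations produced under HPU autocast cannot be passed to `.numpy()` directly (numpy has no bfloat16 dtype), so they are cast to float32 before the `.detach().cpu().numpy()` chain.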