diff --git a/src/otx/algorithms/common/adapters/mmcv/configurer.py b/src/otx/algorithms/common/adapters/mmcv/configurer.py
index 54ee9326b80..83d43192ed7 100644
--- a/src/otx/algorithms/common/adapters/mmcv/configurer.py
+++ b/src/otx/algorithms/common/adapters/mmcv/configurer.py
@@ -176,7 +176,7 @@ def configure_device(self, cfg):
         elif "gpu_ids" not in cfg:
             cfg.gpu_ids = range(1)
 
-        # consider "cuda", "hpu" and "cpu" device only
+        # consider "cuda", "xpu", "hpu" and "cpu" device only
         if is_hpu_available():
             cfg.device = "hpu"
         elif torch.cuda.is_available():
diff --git a/src/otx/algorithms/common/adapters/mmcv/hooks/recording_forward_hook.py b/src/otx/algorithms/common/adapters/mmcv/hooks/recording_forward_hook.py
index 062cc230367..a3b2698babb 100644
--- a/src/otx/algorithms/common/adapters/mmcv/hooks/recording_forward_hook.py
+++ b/src/otx/algorithms/common/adapters/mmcv/hooks/recording_forward_hook.py
@@ -23,6 +23,7 @@
 from torch.nn import LayerNorm
 
 from otx.algorithms.classification import MMCLS_AVAILABLE
+from otx.algorithms.common.utils.utils import cast_bf16_to_fp32
 
 if MMCLS_AVAILABLE:
     from mmcls.models.necks.gap import GlobalAveragePooling
@@ -74,7 +75,7 @@ def _recording_forward(
     ):  # pylint: disable=unused-argument
         tensors = self.func(output)
         if isinstance(tensors, torch.Tensor):
-            tensors_np = tensors.detach().cpu().numpy()
+            tensors_np = cast_bf16_to_fp32(tensors).detach().cpu().numpy()
         elif isinstance(tensors, np.ndarray):
             tensors_np = tensors
         else:
diff --git a/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py b/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py
index 39c9bf5f7b3..226e5e8cc25 100644
--- a/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py
+++ b/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py
@@ -64,7 +64,7 @@ def build_data_parallel(
         model = model.xpu()
         model = XPUDataParallel(model, device_ids=config.gpu_ids)
     elif is_hpu_available() and config.get("gpu_ids", []):
-        model = model.hpu()
+        model = model.to("hpu")
         model = HPUDataParallel(model, device_ids=config.gpu_ids)
     elif torch.cuda.is_available() and config.get("gpu_ids", []):
         if distributed:
@@ -140,9 +140,10 @@ def val_step(self, *inputs, **kwargs):
 
 
 class HPUDataParallel(MMDataParallel):
-    def __init__(self, *args, enable_autocast: bool = False, **kwargs):
+    def __init__(self, *args, enable_autocast: bool = False, put_gt_on_device: bool = True, **kwargs):
         super().__init__(*args, **kwargs)
         self.enable_autocast = enable_autocast
+        self.put_gt_on_device = put_gt_on_device  # when False, scatter() skips moving "gt_*" entries to the HPU
         self.src_device_obj = torch.device("hpu", self.device_ids[0])
 
     def scatter(self, inputs, kwargs, device_ids):
@@ -153,6 +154,10 @@ def scatter(self, inputs, kwargs, device_ids):
                 for val in x:
                     if isinstance(val, dict):
                         for k in val:
+                            # leave "gt_*" annotations where they are (not moved to the HPU)
+                            # so that post-processing can proceed on the CPU
+                            if not self.put_gt_on_device and k.startswith("gt_"):
+                                continue
                             if isinstance(val[k], torch.Tensor):
                                 val[k] = val[k].to(self.src_device_obj)
                             elif isinstance(val[k], list):
diff --git a/src/otx/algorithms/common/utils/__init__.py b/src/otx/algorithms/common/utils/__init__.py
index 6395bd6e60d..80372c59b4b 100644
--- a/src/otx/algorithms/common/utils/__init__.py
+++ b/src/otx/algorithms/common/utils/__init__.py
@@ -14,6 +14,8 @@
 # See the License for the specific language governing permissions
 # and limitations under the License.
 
+import os
+
 from .callback import (
     InferenceProgressCallback,
     OptimizationProgressCallback,
@@ -59,4 +61,5 @@
 
 
 if is_hpu_available():
+    os.environ["PT_HPU_LAZY_MODE"] = "1"
     import habana_frameworks.torch.gpu_migration  # noqa: F401
diff --git a/src/otx/algorithms/detection/adapters/mmdet/apis/train.py b/src/otx/algorithms/detection/adapters/mmdet/apis/train.py
index caf8720b59a..4565631880d 100644
--- a/src/otx/algorithms/detection/adapters/mmdet/apis/train.py
+++ b/src/otx/algorithms/detection/adapters/mmdet/apis/train.py
@@ -24,10 +24,12 @@
 from torchvision.ops import nms as tv_nms
 from torchvision.ops import roi_align as tv_roi_align
 
-from otx.algorithms.common.adapters.mmcv.utils import XPUDataParallel
+from otx.algorithms.common.adapters.mmcv.utils import HPUDataParallel, XPUDataParallel
+from otx.algorithms.common.adapters.mmcv.utils.hpu_optimizers import HABANA_OPTIMIZERS
 
 ext_module = ext_loader.load_ext("_ext", ["nms", "softnms", "nms_match", "nms_rotated", "nms_quadri"])
 dp_factory["xpu"] = XPUDataParallel
+dp_factory["hpu"] = HPUDataParallel
 
 
 def auto_scale_lr(cfg, distributed, logger):
@@ -119,17 +121,27 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times
     elif cfg.device == "xpu":
         model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids, enable_autocast=bool(fp16_cfg))
         model.to(f"xpu:{cfg.gpu_ids[0]}")
+    elif cfg.device == "hpu":
+        model = build_dp(
+            model, cfg.device, device_ids=cfg.gpu_ids, dim=0, enable_autocast=bool(fp16_cfg), put_gt_on_device=False
+        )
+        # patch optimizer
+        if (new_type := "Fused" + cfg.optimizer.get("type", "SGD")) in HABANA_OPTIMIZERS:
+            cfg.optimizer["type"] = new_type
     else:
         model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids)
 
     # build optimizer
     auto_scale_lr(cfg, distributed, logger)
+
+    if cfg.device in ["hpu", "xpu"]:
+        # dynamic patch for nms and roi_align
+        NMSop.forward = monkey_patched_nms
+        RoIAlign.forward = monkey_patched_roi_align
+
     optimizer = build_optimizer(model, cfg.optimizer)
 
     if cfg.device == "xpu":
-        # dynamic patch for nms and roi_align
-        NMSop.forward = monkey_patched_xpu_nms
-        RoIAlign.forward = monkey_patched_xpu_roi_align
         if fp16_cfg is not None:
             dtype = torch.bfloat16
         else:
@@ -198,7 +210,7 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times
     runner.run(data_loaders, cfg.workflow)
 
 
-def monkey_patched_xpu_nms(ctx, bboxes, scores, iou_threshold, offset, score_threshold, max_num):
+def monkey_patched_nms(ctx, bboxes, scores, iou_threshold, offset, score_threshold, max_num):
     """Runs MMCVs NMS with torchvision.nms, or forces NMS from MMCV to run on CPU."""
     is_filtering_by_score = score_threshold > 0
     if is_filtering_by_score:
@@ -228,7 +240,7 @@ def monkey_patched_xpu_nms(ctx, bboxes, scores, iou_threshold, offset, score_thr
     return inds
 
 
-def monkey_patched_xpu_roi_align(self, input, rois):
+def monkey_patched_roi_align(self, input, rois):
     """Replaces MMCVs roi align with the one from torchvision.
 
     Args:
diff --git a/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py b/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py
index b6e5e6ab2dd..36bda12206f 100644
--- a/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py
+++ b/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py
@@ -15,6 +15,7 @@
 # and limitations under the License.
 
 import multiprocessing as mp
+import time
 from typing import Dict, List, Tuple, Union
 
 import mmcv
@@ -22,11 +23,13 @@
 import pycocotools.mask as mask_util
 from mmcv.utils import print_log
 from mmdet.core import BitmapMasks, PolygonMasks, eval_map
+from mmdet.core.evaluation import mean_ap
 from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps
 from mmdet.core.evaluation.class_names import get_classes
 from mmdet.core.evaluation.mean_ap import average_precision
 from terminaltables import AsciiTable
 
+from otx.algorithms.common.utils.utils import is_hpu_available
 from otx.api.entities.label import Domain
 from otx.api.utils.time_utils import timeit
 
@@ -59,6 +62,7 @@ def print_map_summary(  # pylint: disable=too-many-locals,too-many-branches
     if scale_ranges is not None:
         assert len(scale_ranges) == num_scales
 
+    segmentation = bool(results) and all("miou" in res for res in results)  # results: list of per-class dicts
     num_classes = len(results)
 
     recalls = np.zeros((num_scales, num_classes), dtype=np.float32)
@@ -69,7 +73,8 @@ def print_map_summary(  # pylint: disable=too-many-locals,too-many-branches
         if cls_result["recall"].size > 0:
             recalls[:, i] = np.array(cls_result["recall"], ndmin=2)[:, -1]
         aps[:, i] = cls_result["ap"]
-        mious[:, i] = cls_result["miou"]
+        if segmentation:
+            mious[:, i] = cls_result["miou"]
         num_gts[:, i] = cls_result["num_gts"]
 
     if dataset is None:
@@ -82,7 +87,9 @@ def print_map_summary(  # pylint: disable=too-many-locals,too-many-branches
     if not isinstance(mean_ap, list):
         mean_ap = [mean_ap]
 
-    header = ["class", "gts", "dets", "recall", "ap", "miou"]
+    header = ["class", "gts", "dets", "recall", "ap"]
+    if segmentation:
+        header.append("miou")
     for i in range(num_scales):
         if scale_ranges is not None:
             print_log(f"Scale range {scale_ranges[i]}", logger=logger)
@@ -94,12 +101,20 @@ def print_map_summary(  # pylint: disable=too-many-locals,too-many-branches
                 results[j]["num_dets"],
                 f"{recalls[i, j]:.3f}",
                 f"{aps[i, j]:.3f}",
-                f"{mious[i, j]:.3f}",
             ]
+            if segmentation:
+                row_data.append(f"{mious[i, j]:.3f}")
             table_data.append(row_data)
-        table_data.append(["mAP", "", "", "", f"{mean_ap[i]:.3f}", f"{np.mean(mious[i]):.3f}"])
+        table_ = (
+            ["mAP", "", "", "", f"{mean_ap[i]:.3f}", f"{np.mean(mious[i]):.3f}"]
+            if segmentation
+            else ["mAP", "", "", "", f"{mean_ap[i]:.3f}"]
+        )
+        table_data.append(table_)
         table = AsciiTable(table_data)
         table.inner_footing_row_border = True
+        if is_hpu_available():
+            time.sleep(0.1)  # prevent segmentation fault
         print_log("\n" + table.table, logger=logger)
 
 
@@ -245,6 +260,7 @@ def __init__(self, annotation: List[Dict], domain: Domain, classes: List[str], n
         else:
             self.annotation = annotation
         self.nproc = nproc
+        mean_ap.print_map_summary = print_map_summary
 
     def get_gt_instance_masks(self, annotation: List[Dict]):
         """Format ground truth instance mask annotation.
diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py b/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py
index f690c38d86b..a8e926cae5d 100644
--- a/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py
+++ b/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py
@@ -38,7 +38,6 @@ class CustomSingleStageDetector(SAMDetectorMixin, DetLossDynamicsTrackingMixin,
 
     def __init__(self, *args, task_adapt=None, **kwargs):
         super().__init__(*args, **kwargs)
-
         # Hook for class-sensitive weight loading
         if task_adapt:
             self._register_load_state_dict_pre_hook(
diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py
index 28da39d0a1b..e705d18bdc8 100644
--- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py
+++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py
@@ -10,6 +10,7 @@
 from .custom_fcn_mask_head import CustomFCNMaskHead
 from .custom_retina_head import CustomRetinaHead
 from .custom_roi_head import CustomRoIHead
+from .custom_rpn_head import CustomRPNHead
 from .custom_ssd_head import CustomSSDHead
 from .custom_vfnet_head import CustomVFNetHead
 from .custom_yolox_head import CustomYOLOXHead
@@ -27,6 +28,7 @@
     "CustomVFNetHead",
     "CustomYOLOXHead",
     "DETRHeadExtension",
+    "CustomRPNHead",
     # Loss dynamics tracking
     "CustomATSSHeadTrackingLossDynamics",
 ]
diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_atss_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_atss_head.py
index 477790e0d4d..41b7fd3aa8b 100644
--- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_atss_head.py
+++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_atss_head.py
@@ -49,6 +49,32 @@ def __init__(self, *args, bg_loss_weight=-1.0, use_qfl=False, qfl_cfg=None, **kw
         self.bg_loss_weight = bg_loss_weight
         self.use_qfl = use_qfl
 
+    def forward_single(self, x, scale):
+        """Compute one scale level's predictions, returning CPU copies when on HPU.
+
+        Args:
+            x (Tensor): Features of a single scale level.
+            scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize
+                the bbox prediction.
+
+        Returns:
+            tuple:
+                cls_score (Tensor): Cls scores for a single scale level
+                    the channels number is num_anchors * num_classes.
+                bbox_pred (Tensor): Box energies / deltas for a single scale
+                    level, the channels number is num_anchors * 4.
+                centerness (Tensor): Centerness for a single scale level, the
+                    channel number is (N, num_anchors * 1, H, W).
+        """
+        cls_score, bbox_pred, centerness = super().forward_single(x, scale)
+        outputs = (cls_score, bbox_pred, centerness)
+        if cls_score.device.type != "hpu":
+            return outputs
+
+        # further post-processing is done on the CPU, so each HPU
+        # prediction is handed back as a CPU copy
+        return tuple(pred.cpu() for pred in outputs)
+
     @force_fp32(apply_to=("cls_scores", "bbox_preds", "centernesses"))
     def loss(self, cls_scores, bbox_preds, centernesses, gt_bboxes, gt_labels, img_metas, gt_bboxes_ignore=None):
         """Compute losses of the head.
diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py
index 05902fc9e70..d8e546a5f91 100644
--- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py
+++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py
@@ -54,6 +54,14 @@ def _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels, img_met
         bbox_results.update(loss_bbox=loss_bbox)
         return bbox_results
 
+    def _mask_forward(self, x, rois=None, pos_inds=None, bbox_feats=None):
+        """Run the parent mask forward; if "mask_pred" is on HPU, move it and "mask_feats" to CPU."""
+        outputs = super()._mask_forward(x, rois, pos_inds, bbox_feats)
+        on_hpu = outputs["mask_pred"].device.type == "hpu"
+        for key in ("mask_pred", "mask_feats") if on_hpu else ():
+            outputs[key] = outputs[key].cpu()
+        return outputs
+
 
 @HEADS.register_module()
 class CustomConvFCBBoxHead(Shared2FCBBoxHead, CrossDatasetDetectorHead):
@@ -125,6 +133,16 @@ def get_targets(self, sampling_results, gt_bboxes, gt_labels, img_metas, rcnn_tr
             valid_label_mask = torch.cat(valid_label_mask, 0)
         return labels, label_weights, bbox_targets, bbox_weights, valid_label_mask
 
+    def forward(self, x):
+        """Run the parent bbox-head forward; results produced on HPU are returned as CPU copies."""
+        cls_score, bbox_pred = super().forward(x)
+        outputs = (cls_score, bbox_pred)
+        if cls_score.device.type != "hpu":
+            return outputs
+
+        # the class scores decide the device: on HPU both tensors are copied to the CPU
+        return tuple(pred.cpu() for pred in outputs)
+
     @force_fp32(apply_to=("cls_score", "bbox_pred"))
     def loss(
         self,
diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py
new file mode 100644
index 00000000000..b5bb4184fe3
--- /dev/null
+++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py
@@ -0,0 +1,25 @@
+"""Custom RPN head for OTX template."""
+# Copyright (C) 2022 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+from mmdet.models.builder import HEADS
+from mmdet.models.dense_heads import RPNHead
+
+
+@HEADS.register_module()
+class CustomRPNHead(RPNHead):
+    """RPN head whose per-level predictions are handed back on CPU when computed on HPU.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+        num_convs (int): Number of convolution layers in the head. Default 1.
+    """
+
+    def forward_single(self, x):
+        """Run the parent single-level forward and return CPU copies of HPU outputs."""
+        rpn_cls_score, rpn_bbox_pred = super().forward_single(x)
+        outputs = (rpn_cls_score, rpn_bbox_pred)
+        if rpn_cls_score.device.type != "hpu":
+            return outputs
+        return tuple(pred.cpu() for pred in outputs)
diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_ssd_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_ssd_head.py
index 6d5f1ce8427..7aebbcb3173 100644
--- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_ssd_head.py
+++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_ssd_head.py
@@ -81,6 +81,35 @@ def _init_layers(self):
                     nn.Conv2d(in_channel, num_base_priors * self.cls_out_channels, kernel_size=3, padding=1)
                 )
 
+    def forward(self, feats):
+        """Apply the per-level cls/reg convolutions, collecting HPU outputs as CPU copies.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple:
+                cls_scores (list[Tensor]): Classification scores for all scale
+                    levels, each is a 4D-tensor, the channels number is
+                    num_anchors * num_classes.
+                bbox_preds (list[Tensor]): Box energies / deltas for all scale
+                    levels, each is a 4D-tensor, the channels number is
+                    num_anchors * 4.
+        """
+        cls_scores, bbox_preds = [], []
+        for level_feat, reg_layer, cls_layer in zip(feats, self.reg_convs, self.cls_convs):
+            cls_out = cls_layer(level_feat)
+            reg_out = reg_layer(level_feat)
+            # the classification output's device decides for both branches:
+            # results computed on HPU are stored as CPU copies, others untouched
+            if cls_out.device.type == "hpu":
+                cls_out, reg_out = cls_out.cpu(), reg_out.cpu()
+            cls_scores.append(cls_out)
+            bbox_preds.append(reg_out)
+
+        return cls_scores, bbox_preds
+
     def loss_single(
         self,
         cls_score,
diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py
index 5de9fc272ff..161d692e4f4 100644
--- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py
+++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py
@@ -103,6 +103,16 @@ def loss(self, cls_scores, bbox_preds, objectnesses, gt_bboxes, gt_labels, img_m
 
         return loss_dict
 
+    def forward_single(self, x, cls_convs, reg_convs, conv_cls, conv_reg, conv_obj):
+        """Run the parent single-level forward; outputs produced on HPU are returned as CPU copies."""
+        cls_score, bbox_pred, objectness = super().forward_single(x, cls_convs, reg_convs, conv_cls, conv_reg, conv_obj)
+        outputs = (cls_score, bbox_pred, objectness)
+        if cls_score.device.type != "hpu":
+            return outputs
+        # further post-processing is done on the CPU,
+        # so hand back CPU copies of every prediction
+        return tuple(pred.cpu() for pred in outputs)
+
 
 @HEADS.register_module()
 class CustomYOLOXHeadTrackingLossDynamics(TrackingLossDynamicsMixIn, CustomYOLOXHead):
@@ -245,6 +255,7 @@ def _get_target_single(self, cls_preds, objectness, priors, decoded_bboxes, gt_b
         num_priors = priors.size(0)
         num_gts = gt_labels.size(0)
         gt_bboxes = gt_bboxes.to(decoded_bboxes.dtype)
+
         # No target
         if num_gts == 0:
             cls_target = cls_preds.new_zeros((0, self.num_classes))
diff --git a/src/otx/algorithms/detection/adapters/mmdet/task.py b/src/otx/algorithms/detection/adapters/mmdet/task.py
index 3b8040408be..53c00ceaf32 100644
--- a/src/otx/algorithms/detection/adapters/mmdet/task.py
+++ b/src/otx/algorithms/detection/adapters/mmdet/task.py
@@ -44,8 +44,8 @@
 from otx.algorithms.common.utils.data import get_dataset
 from otx.algorithms.common.utils.logger import get_logger
 from otx.algorithms.detection.adapters.mmdet.apis.train import (
-    monkey_patched_xpu_nms,
-    monkey_patched_xpu_roi_align,
+    monkey_patched_nms,
+    monkey_patched_roi_align,
     train_detector,
 )
 from otx.algorithms.detection.adapters.mmdet.configurer import (
@@ -348,9 +348,9 @@ def _infer_model(
         else:
             target_classes = mm_dataset.CLASSES
 
-        if cfg.device == "xpu":
-            NMSop.forward = monkey_patched_xpu_nms
-            RoIAlign.forward = monkey_patched_xpu_roi_align
+        if cfg.device in ["xpu", "hpu"]:
+            NMSop.forward = monkey_patched_nms
+            RoIAlign.forward = monkey_patched_roi_align
 
         # Model
         model = self.build_model(cfg, fp16=cfg.get("fp16", False))
diff --git a/src/otx/algorithms/detection/configs/instance_segmentation/efficientnetb2b_maskrcnn/model.py b/src/otx/algorithms/detection/configs/instance_segmentation/efficientnetb2b_maskrcnn/model.py
index 72ca9481ef3..03cb21733dc 100644
--- a/src/otx/algorithms/detection/configs/instance_segmentation/efficientnetb2b_maskrcnn/model.py
+++ b/src/otx/algorithms/detection/configs/instance_segmentation/efficientnetb2b_maskrcnn/model.py
@@ -28,7 +28,7 @@
     type="CustomMaskRCNN",  # Use CustomMaskRCNN for Incremental Learning
     neck=dict(type="FPN", in_channels=[24, 48, 120, 352], out_channels=80, num_outs=5),
     rpn_head=dict(
-        type="RPNHead",
+        type="CustomRPNHead",
         in_channels=80,
         feat_channels=80,
         anchor_generator=dict(type="AnchorGenerator", scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]),
diff --git a/src/otx/algorithms/detection/configs/instance_segmentation/maskrcnn_swin_t/model.py b/src/otx/algorithms/detection/configs/instance_segmentation/maskrcnn_swin_t/model.py
index 66f7522bdee..203470d2fac 100644
--- a/src/otx/algorithms/detection/configs/instance_segmentation/maskrcnn_swin_t/model.py
+++ b/src/otx/algorithms/detection/configs/instance_segmentation/maskrcnn_swin_t/model.py
@@ -38,7 +38,7 @@
     ),
     neck=dict(type="FPN", in_channels=[96, 192, 384, 768], out_channels=256, num_outs=5),
     rpn_head=dict(
-        type="RPNHead",
+        type="CustomRPNHead",
         in_channels=256,
         feat_channels=256,
         anchor_generator=dict(type="AnchorGenerator", scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]),
diff --git a/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/model.py b/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/model.py
index 6832028e425..d8918edc33f 100644
--- a/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/model.py
+++ b/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/model.py
@@ -33,7 +33,7 @@
         num_outs=5,
     ),
     rpn_head=dict(
-        type="RPNHead",
+        type="CustomRPNHead",
         in_channels=256,
         feat_channels=256,
         anchor_generator=dict(