diff --git a/.ci/docker/Dockerfile b/.ci/docker/Dockerfile index a0b137efe8d..cb591488a76 100644 --- a/.ci/docker/Dockerfile +++ b/.ci/docker/Dockerfile @@ -5,16 +5,9 @@ FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04@sha256:c1869c30f46fff478a37ed58d9dace7e08519541274f03424d0b78bd35b2c73a AS python_base_cuda LABEL maintainer="OpenVINO Training Extensions Development Team" -ARG HTTP_PROXY -ARG HTTPS_PROXY -ARG NO_PROXY ARG uid ARG gid -# Setup proxies -ENV http_proxy=$HTTP_PROXY -ENV https_proxy=$HTTPS_PROXY -ENV no_proxy=$NO_PROXY ENV DEBIAN_FRONTEND="noninteractive" # hadolint ignore=DL3008 diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 063c765b4ac..28d1c40f3f8 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -53,9 +53,6 @@ fi TAG=$1 docker build -f ./Dockerfile \ ---build-arg HTTP_PROXY="${http_proxy:?}" \ ---build-arg HTTPS_PROXY="${https_proxy:?}" \ ---build-arg NO_PROXY="${no_proxy:?}" \ --build-arg ACTIONS_RUNNER_VER="$ACTIONS_RUNNER_VER" \ --build-arg gid="$(id -g)" \ --build-arg uid="$UID" \ diff --git a/.ci/docker/start-runner.sh b/.ci/docker/start-runner.sh index 2e38efcd0d3..dd1fbf5cc31 100755 --- a/.ci/docker/start-runner.sh +++ b/.ci/docker/start-runner.sh @@ -1,7 +1,7 @@ #!/bin/bash GPU_ID="all" -VER_CUDA="11.7.1" +VER_CUDA="12.1.0" TAG_RUNNER="latest" ADDITIONAL_LABELS="" MOUNT_PATH="" @@ -149,9 +149,6 @@ if [ "$DEBUG_CONTAINER" = true ]; then --name "$CONTAINER_NAME" \ -e NVIDIA_VISIBLE_DEVICES="$GPU_ID" \ ${ENV_FLAGS} \ - -e http_proxy=http://proxy-chain.intel.com:911 \ - -e https_proxy=http://proxy-chain.intel.com:912 \ - -e no_proxy=intel.com,.intel.com,localhost,127.0.0.0/8 \ ${MOUNT_FLAGS} \ ${CACHE_MOUNT_FLAGS} \ "$DOCKER_REG_ADDR"/ote/ci/cu"$VER_CUDA"/runner:"$TAG_RUNNER"; RET=$? @@ -172,9 +169,6 @@ else --name "$CONTAINER_NAME" \ -e NVIDIA_VISIBLE_DEVICES="$GPU_ID" \ ${ENV_FLAGS} \ - -e http_proxy=http://proxy-chain.intel.com:911 \ - -e https_proxy=http://proxy-chain.intel.com:912 \ - -e no_proxy=intel.com,.intel.com,localhost,127.0.0.0/8 \ ${MOUNT_FLAGS} \ ${CACHE_MOUNT_FLAGS} \ "$DOCKER_REG_ADDR"/ote/ci/cu"$VER_CUDA"/runner:"$TAG_RUNNER"; RET=$? 
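Context for the CI changes above: with the proxy build args and the hard-coded `-e http_proxy=.../https_proxy=.../no_proxy=...` flags removed, proxy settings are no longer baked into the image or the runner scripts. On a runner host that still sits behind a proxy, the standard Docker client configuration can supply them instead; a minimal sketch of `~/.docker/config.json` (the proxy URLs below are placeholders, not values from this repository):

{
  "proxies": {
    "default": {
      "httpProxy": "http://proxy.example.com:911",
      "httpsProxy": "http://proxy.example.com:912",
      "noProxy": "localhost,127.0.0.0/8"
    }
  }
}

With this client-side configuration, Docker injects the corresponding http_proxy/https_proxy/no_proxy environment variables into builds and started containers automatically, so neither the removed Dockerfile ARG/ENV block nor per-container `-e` flags in start-runner.sh are needed.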
diff --git a/.github/workflows/pre_merge.yaml b/.github/workflows/pre_merge.yaml index b84dbb2b93e..f1964e4b64b 100644 --- a/.github/workflows/pre_merge.yaml +++ b/.github/workflows/pre_merge.yaml @@ -101,9 +101,6 @@ jobs: - task: "multi_cls_classification" - task: "multi_label_classification" - task: "hlabel_classification" - - task: "detection" - - task: "instance_segmentation" - - task: "semantic_segmentation" - task: "visual_prompting" - task: "zero_shot_visual_prompting" - task: "anomaly_classification" @@ -127,3 +124,32 @@ jobs: rm /tmp/requirements.txt - name: Run Integration Test run: tox -vv -e integration-test-${{ matrix.task }} + Integration-Test-Large: + if: | + github.event.pull_request.draft == false && + !(startsWith(github.event.pull_request.title, '[WIP]')) + runs-on: [self-hosted, linux, x64, dev, dmount] + needs: Unit-Test + strategy: + fail-fast: false + matrix: + include: + - task: "detection" + - task: "instance_segmentation" + - task: "semantic_segmentation" + name: Integration-Test-Large-${{ matrix.task }}-py310 + steps: + - name: Checkout repository + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + - name: Install Python + uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + with: + python-version: "3.10" + - name: Install tox + run: | + python -m pip install --require-hashes --no-deps -r .ci/requirements.txt + pip-compile --generate-hashes --output-file=/tmp/requirements.txt --extra=ci_tox pyproject.toml + python -m pip install --require-hashes --no-deps -r /tmp/requirements.txt + rm /tmp/requirements.txt + - name: Run Integration Test + run: tox -vv -e integration-test-${{ matrix.task }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 15130d89995..f39500d2c8f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,78 +7,82 @@ All notable changes to this project will be documented in this file. 
### New features - Turn on/off classification augmentations - (https://github.com/openvinotoolkit/training_extensions/pull/4039) + () ### Enhancements - Update visual prompting pipeline for multi-label zero-shot learning support - (https://github.com/openvinotoolkit/training_extensions/pull/3993) + () - Update to work torch compile in detection - (https://github.com/openvinotoolkit/training_extensions/pull/4003) + () - Refactor MaskDINO - (https://github.com/openvinotoolkit/training_extensions/pull/4006) + () - Fix MaskRCNN/RTMDet-Inst/MaskRCNNTV Explain Mode - (https://github.com/openvinotoolkit/training_extensions/pull/4053) + () ## \[2.3.0\] ### New features - Add YOLOv9 model for Object Detection - (https://github.com/openvinotoolkit/training_extensions/pull/3917) + (, ) - Add OV inference for keypoint detection - (https://github.com/openvinotoolkit/training_extensions/pull/3970) + () - Add tiling for semantic segmentation - (https://github.com/openvinotoolkit/training_extensions/pull/3954) + () - Add 3D Object Detection task with MonoDETR model - (https://github.com/openvinotoolkit/training_extensions/pull/3979) + () +- Add OpenVINO inference for 3D Object Detection task + () ### Enhancements - Upgrade OV, MAPI, and NNCF dependencies - (https://github.com/openvinotoolkit/training_extensions/pull/3967) + () - Instance Segmentation Model refactoring - (https://github.com/openvinotoolkit/training_extensions/pull/3865) + () - Bump torch and lightning to 2.4.0 versions - (https://github.com/openvinotoolkit/training_extensions/pull/3843) + () - Add mAP metric to evaluate multilabel classification - (https://github.com/openvinotoolkit/training_extensions/pull/3985) + () ### Bug fixes - Fix a wrong HPO log - (https://github.com/openvinotoolkit/training_extensions/pull/3972) + () +- Update model name in rotated detection recipes + () ## \[2.2.0\] ### New features - Add RT-DETR model for Object Detection - (https://github.com/openvinotoolkit/training_extensions/pull/3741) + () - Add Multi-Label & H-label Classification with torchvision models - (https://github.com/openvinotoolkit/training_extensions/pull/3697) + () - Add Hugging-Face Model Wrapper for Classification - (https://github.com/openvinotoolkit/training_extensions/pull/3710) + () - Add LoRA finetuning capability for ViT Architectures - (https://github.com/openvinotoolkit/training_extensions/pull/3729) + () - Add Hugging-Face Model Wrapper for Object Detection - (https://github.com/openvinotoolkit/training_extensions/pull/3747) + () - Add Hugging-Face Model Wrapper for Semantic Segmentation - (https://github.com/openvinotoolkit/training_extensions/pull/3749) + () - Enable torch.compile to work with classification - (https://github.com/openvinotoolkit/training_extensions/pull/3758) + () - Add `otx benchmark` subcommand - (https://github.com/openvinotoolkit/training_extensions/pull/3762) + () - Add RTMPose for Keypoint Detection Task - (https://github.com/openvinotoolkit/training_extensions/pull/3781) + (, ) - Add Semi-SL MeanTeacher algorithm for Semantic Segmentation - (https://github.com/openvinotoolkit/training_extensions/pull/3801) + () - Update head and h-label format for hierarchical label classification - (https://github.com/openvinotoolkit/training_extensions/pull/3810) + () - Support configurable input size - (https://github.com/openvinotoolkit/training_extensions/pull/3788) + () - Add diffusion task - (https://github.com/openvinotoolkit/training_extensions/pull/3875) + () ### Enhancements @@ -106,9 +110,21 @@ All notable 
changes to this project will be documented in this file. () - Change sematic segmentation to consider bbox only annotations () +- Relieve memory usage criteria on batch size 2 during adaptive batch size + () +- Remove background label from RT Info for segmentation task + () +- Prevent using too low confidence thresholds in detection + () +- Update HPO interface + () +- Bump onnx to 1.17.0 to omit CVE-2024-5187 + () ### Bug fixes +- Update anomaly base transforms to use square resizing + () - Fix Combined Dataloader & unlabeled warmup loss in Semi-SL () - Revert #3579 to fix issues with replacing coco_instance with a different format in some dataset @@ -121,6 +137,30 @@ All notable changes to this project will be documented in this file. () - Fix config converter for tiling () +- Fix `BboxOverlaps2D` handling of empty ground-truth annotations in datasets. + () +- Fix num_trials calculation on dataset length less than num_class + () +- Fix out_features in HierarchicalCBAMClsHead + () +- Fix multilabel_accuracy of MixedHLabelAccuracy + () +- Fix wrong indices setting in HLabelInfo + () +- Add legacy template LiteHRNet_18 template + () +- Model templates: rename model_status value 'DISCONTINUED' to 'OBSOLETE' + () +- Enable export of feature vectors for semantic segmentation task + () +- Update MRCNN model export to include feature vector and saliency map + () +- Upgrade MAPI in 2.2 + () +- Fix applying model's hparams when loading model from checkpoint + () +- Fix incorrect all_groups order configuration in HLabelInfo + () ## \[v2.1.0\] diff --git a/README.md b/README.md index f42741bb689..435415abb0f 100644 --- a/README.md +++ b/README.md @@ -197,6 +197,9 @@ In addition to the examples above, please refer to the documentation for tutoria - Include full image with anno in case there's no tile in tile dataset - Add type checker in converter for callable functions (optimizer, scheduler) - Change sematic segmentation to consider bbox only annotations +- Relieve memory usage criteria on batch size 2 during adaptive batch size +- Remove background label from RT Info for segmentation task +- Prevent using too low confidence thresholds in detection ### Bug fixes @@ -206,6 +209,10 @@ In addition to the examples above, please refer to the documentation for tutoria - Add missing tile recipes and various tile recipe changes - Change categories mapping logic - Fix config converter for tiling +- Fix num_trials calculation on dataset length less than num_class +- Fix out_features in HierarchicalCBAMClsHead +- Fix multilabel_accuracy of MixedHLabelAccuracy +- Fix wrong indices setting in HLabelInfo ### Known issues diff --git a/docs/source/guide/release_notes/index.rst b/docs/source/guide/release_notes/index.rst index a0e0954c2a8..e0b8dc86383 100644 --- a/docs/source/guide/release_notes/index.rst +++ b/docs/source/guide/release_notes/index.rst @@ -4,7 +4,7 @@ Releases .. 
toctree:: :maxdepth: 1 -v2.2.0 (2024.09) +v2.2.0 (2024.10) ---------------- New features @@ -38,6 +38,9 @@ Enhancements - Include full image with anno in case there's no tile in tile dataset - Add type checker in converter for callable functions (optimizer, scheduler) - Change sematic segmentation to consider bbox only annotations +- Relieve memory usage criteria on batch size 2 during adaptive batch size +- Remove background label from RT Info for segmentation task +- Prevent using too low confidence thresholds in detection Bug fixes ^^^^^^^^^ @@ -48,6 +51,10 @@ Bug fixes - Add missing tile recipes and various tile recipe changes - Change categories mapping logic - Fix config converter for tiling +- Fix num_trials calculation on dataset length less than num_class +- Fix out_features in HierarchicalCBAMClsHead +- Fix multilabel_accuracy of MixedHLabelAccuracy +- Fix wrong indices setting in HLabelInfo v2.1.0 (2024.07) ---------------- diff --git a/pyproject.toml b/pyproject.toml index c9eba50c14f..d32427f7149 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,8 +81,8 @@ xpu = [ "timm==1.0.3", "openvino==2024.4", "openvino-dev==2024.4", - "openvino-model-api==0.2.4", - "onnx==1.16.2", + "openvino-model-api==0.2.5", + "onnx==1.17.0", "onnxconverter-common==1.14.0", "nncf==2.13.0", "anomalib[core]==1.1.0", @@ -96,7 +96,7 @@ base = [ "openvino==2024.4", "openvino-dev==2024.4", "openvino-model-api==0.2.4", - "onnx==1.16.2", + "onnx==1.17.0", "onnxconverter-common==1.14.0", "nncf==2.13.0", "anomalib[core]==1.1.0", diff --git a/src/otx/algo/classification/heads/hlabel_cls_head.py b/src/otx/algo/classification/heads/hlabel_cls_head.py index f1041d06079..71268bb9ea0 100644 --- a/src/otx/algo/classification/heads/hlabel_cls_head.py +++ b/src/otx/algo/classification/heads/hlabel_cls_head.py @@ -355,7 +355,7 @@ def __init__( self.fc_superclass = nn.Linear(in_channels * self.step_size[0] * self.step_size[1], num_multiclass_heads) self.attention_fc = nn.Linear(num_multiclass_heads, in_channels * self.step_size[0] * self.step_size[1]) self.cbam = CBAM(in_channels) - self.fc_subclass = nn.Linear(in_channels * self.step_size[0] * self.step_size[1], num_single_label_classes) + self.fc_subclass = nn.Linear(in_channels * self.step_size[0] * self.step_size[1], num_classes) self._init_layers() diff --git a/src/otx/algo/common/layers/position_embed.py b/src/otx/algo/common/layers/position_embed.py index 5afe6010a5d..d875e68a25a 100644 --- a/src/otx/algo/common/layers/position_embed.py +++ b/src/otx/algo/common/layers/position_embed.py @@ -70,60 +70,6 @@ def forward(self, tensor_list: NestedTensor | torch.Tensor) -> torch.Tensor: return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) -class PositionEmbeddingLearned(nn.Module): - """Absolute pos embedding, learned.""" - - def __init__(self, num_pos_feats: int = 256): - """Positional embedding.""" - super().__init__() - self.row_embed = nn.Embedding(50, num_pos_feats) - self.col_embed = nn.Embedding(50, num_pos_feats) - - def forward(self, tensor_list: NestedTensor) -> torch.Tensor: - """Forward pass of the PositionEmbeddingLearned module. - - Args: - tensor_list (NestedTensor): Input tensor. - - Returns: - torch.Tensor: Position embeddings. 
- """ - x = tensor_list.tensors - h, w = x.shape[-2:] - i = torch.arange(w, device=x.device) / w * 49 - j = torch.arange(h, device=x.device) / h * 49 - x_emb = self.get_embed(i, self.col_embed) - y_emb = self.get_embed(j, self.row_embed) - return ( - torch.cat( - [ - x_emb.unsqueeze(0).repeat(h, 1, 1), - y_emb.unsqueeze(1).repeat(1, w, 1), - ], - dim=-1, - ) - .permute(2, 0, 1) - .unsqueeze(0) - .repeat(x.shape[0], 1, 1, 1) - ) - - def get_embed(self, coord: torch.Tensor, embed: nn.Embedding) -> torch.Tensor: - """Get the embedding for the given coordinates. - - Args: - coord (torch.Tensor): The coordinates. - embed (nn.Embedding): The embedding layer. - - Returns: - torch.Tensor: The embedding for the coordinates. - """ - floor_coord = coord.floor() - delta = (coord - floor_coord).unsqueeze(-1) - floor_coord = floor_coord.long() - ceil_coord = (floor_coord + 1).clamp(max=49) - return embed(floor_coord) * (1 - delta) + embed(ceil_coord) * delta - - def gen_sineembed_for_position(pos_tensor: torch.Tensor) -> torch.Tensor: """Generate sine embeddings for position tensor. diff --git a/src/otx/algo/common/losses/cross_focal_loss.py b/src/otx/algo/common/losses/cross_focal_loss.py index bfec15c0c84..457876a5986 100644 --- a/src/otx/algo/common/losses/cross_focal_loss.py +++ b/src/otx/algo/common/losses/cross_focal_loss.py @@ -8,7 +8,7 @@ import torch import torch.nn.functional from torch import Tensor, nn -from torch.cuda.amp import custom_fwd +from torch.amp import custom_fwd from .focal_loss import py_sigmoid_focal_loss @@ -79,7 +79,7 @@ def __init__( self.cls_criterion = cross_sigmoid_focal_loss - @custom_fwd(cast_inputs=torch.float32) + @custom_fwd(cast_inputs=torch.float32, device_type="cuda") def forward( self, pred: Tensor, diff --git a/src/otx/algo/instance_segmentation/rtmdet_inst.py b/src/otx/algo/instance_segmentation/rtmdet_inst.py index 9bf85fbcbe9..3625d46f874 100644 --- a/src/otx/algo/instance_segmentation/rtmdet_inst.py +++ b/src/otx/algo/instance_segmentation/rtmdet_inst.py @@ -130,6 +130,7 @@ def _exporter(self) -> OTXModelExporter: "opset_version": 11, "autograd_inlining": False, }, + # TODO(Eugene): Add XAI support for RTMDetInst output_names=["bboxes", "labels", "masks", "feature_vector", "saliency_map"] if self.explain_mode else None, ) diff --git a/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py b/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py index 02be0e943c7..0911d26050d 100644 --- a/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py +++ b/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py @@ -9,38 +9,14 @@ import torch import torchvision from torch import nn +from torchvision.models import get_model_weights from torchvision.models._utils import IntermediateLayerGetter -from otx.algo.common.layers.position_embed import PositionEmbeddingLearned, PositionEmbeddingSine +from otx.algo.common.layers.position_embed import PositionEmbeddingSine from otx.algo.modules.norm import FrozenBatchNorm2d from otx.algo.object_detection_3d.utils.utils import NestedTensor -def build_position_encoding( - hidden_dim: int, - position_embedding: str | PositionEmbeddingSine | PositionEmbeddingLearned, -) -> PositionEmbeddingSine | PositionEmbeddingLearned: - """Build the position encoding module. - - Args: - hidden_dim (int): The hidden dimension. - position_embedding (Union[str, PositionEmbeddingSine, PositionEmbeddingLearned]): The position embedding type. 
- - Returns: - Union[PositionEmbeddingSine, PositionEmbeddingLearned]: The position encoding module. - """ - n_steps = hidden_dim // 2 - if position_embedding in ("v2", "sine"): - position_embedding = PositionEmbeddingSine(n_steps, normalize=True) - elif position_embedding in ("v3", "learned"): - position_embedding = PositionEmbeddingLearned(n_steps) - else: - msg = f"not supported {position_embedding}" - raise ValueError(msg) - - return position_embedding - - class BackboneBase(nn.Module): """BackboneBase module.""" @@ -85,7 +61,7 @@ def __init__(self, name: str, train_backbone: bool, return_interm_layers: bool, norm_layer = FrozenBatchNorm2d backbone = getattr(torchvision.models, name)( replace_stride_with_dilation=[False, False, dilation], - pretrained=True, + weights=get_model_weights(name).IMAGENET1K_V1, # the same as pretrained=True norm_layer=norm_layer, ) super().__init__(backbone, train_backbone, return_interm_layers) @@ -99,13 +75,13 @@ class Joiner(nn.Sequential): def __init__( self, backbone: nn.Module, - position_embedding: PositionEmbeddingSine | PositionEmbeddingLearned, + position_embedding: PositionEmbeddingSine, ) -> None: """Initialize the Joiner module. Args: backbone (nn.Module): The backbone module. - position_embedding (Union[PositionEmbeddingSine, PositionEmbeddingLearned]): The position embedding module. + position_embedding (PositionEmbeddingSine): The position embedding module. """ super().__init__(backbone, position_embedding) self.strides = backbone.strides @@ -135,7 +111,6 @@ class BackboneBuilder: "return_interm_layers": True, "positional_encoding": { "hidden_dim": 256, - "position_embedding": "sine", }, }, } @@ -144,5 +119,6 @@ def __new__(cls, model_name: str) -> Joiner: """Constructor for Backbone MonoDetr.""" # TODO (Kirill): change backbone to already implemented in OTX backbone = Backbone(**cls.CFG[model_name]) - position_embedding = build_position_encoding(**cls.CFG[model_name]["positional_encoding"]) + n_steps = cls.CFG[model_name]["positional_encoding"]["hidden_dim"] // 2 + position_embedding = PositionEmbeddingSine(n_steps, normalize=True) return Joiner(backbone, position_embedding) diff --git a/src/otx/algo/object_detection_3d/detectors/monodetr.py b/src/otx/algo/object_detection_3d/detectors/monodetr.py index b102a054fe3..3b05a90827a 100644 --- a/src/otx/algo/object_detection_3d/detectors/monodetr.py +++ b/src/otx/algo/object_detection_3d/detectors/monodetr.py @@ -25,10 +25,10 @@ def __init__( backbone: nn.Module, depthaware_transformer: nn.Module, depth_predictor: nn.Module, - criterion: nn.Module, num_classes: int, num_queries: int, num_feature_levels: int, + criterion: nn.Module | None = None, aux_loss: bool = True, with_box_refine: bool = False, init_box: bool = False, @@ -41,7 +41,7 @@ def __init__( backbone (nn.Module): torch module of the backbone to be used. See backbone.py depthaware_transformer (nn.Module): depth-aware transformer architecture. See depth_aware_transformer.py depth_predictor (nn.Module): depth predictor module - criterion (nn.Module): loss criterion module + criterion (nn.Module | None): loss criterion module num_classes (int): number of object classes num_queries (int): number of object queries, ie detection slot. This is the maximal number of objects DETR can detect in a single image. For KITTI, we recommend 50 queries. @@ -149,12 +149,17 @@ def forward( """Forward method of the MonoDETR model. 
Args: - images (list[Tensor]): images for each sample - calibs (Tensor): camera matrices for each sample - img_sizes (Tensor): image sizes for each sample - targets (list[dict[Tensor]): ground truth boxes and labels for each - sample + images (Tensor): images for each sample. + calibs (Tensor): camera matrices for each sample. + img_sizes (Tensor): image sizes for each sample. + targets (list[dict[str, Tensor]): ground truth boxes and labels for each + sample. Defaults to None. mode (str): The mode of operation. Defaults to "predict". + + Returns: + dict[str, Tensor]: A dictionary of tensors. If mode is "loss", the + tensors are the loss values. If mode is "predict", the tensors are + the logits. """ features, pos = self.backbone(images) @@ -230,7 +235,7 @@ def forward( # depth_geo box2d_height_norm = outputs_coord[:, :, 4] + outputs_coord[:, :, 5] - box2d_height = torch.clamp(box2d_height_norm * img_sizes[:, 1:2], min=1.0) + box2d_height = torch.clamp(box2d_height_norm * img_sizes[:, :1], min=1.0) depth_geo = size3d[:, :, 0] / box2d_height * calibs[:, 0, 0].unsqueeze(1) # depth_reg @@ -285,6 +290,9 @@ def forward( ) if mode == "loss": + if self.criterion is None: + msg = "Criterion is not set for the model" + raise ValueError(msg) return self.criterion(outputs=out, targets=targets) return out diff --git a/src/otx/algo/object_detection_3d/heads/depth_predictor.py b/src/otx/algo/object_detection_3d/heads/depth_predictor.py index 4e5037c96d8..87827144b21 100644 --- a/src/otx/algo/object_detection_3d/heads/depth_predictor.py +++ b/src/otx/algo/object_detection_3d/heads/depth_predictor.py @@ -32,6 +32,8 @@ def __init__( depth_min (float): The minimum depth value. depth_max (float): The maximum depth value. hidden_dim (int): The dimension of the hidden layer. + activation (Callable[..., nn.Module], optional): The activation function. + Defaults to nn.ReLU. """ super().__init__() self.depth_max = depth_max diff --git a/src/otx/algo/object_detection_3d/heads/depthaware_transformer.py b/src/otx/algo/object_detection_3d/heads/depthaware_transformer.py index 4269ba1950d..7592c312c05 100644 --- a/src/otx/algo/object_detection_3d/heads/depthaware_transformer.py +++ b/src/otx/algo/object_detection_3d/heads/depthaware_transformer.py @@ -4,7 +4,6 @@ """depth aware transformer head for 3d object detection.""" from __future__ import annotations -import math from typing import Any, Callable, ClassVar import torch @@ -101,84 +100,6 @@ def _reset_parameters(self) -> None: constant_(self.reference_points.bias.data, 0.0) normal_(self.level_embed) - def get_proposal_pos_embed(self, proposals: Tensor) -> Tensor: - """Generate position embeddings for proposal tensor. - - Args: - proposals (Tensor): Proposal tensor of shape (N, L, 6). - - TODO (Kirill): Not used. Remove this function? - - Returns: - Tensor: Position embeddings for proposal tensor of shape (N, L, embedding_dim). 
- """ - num_pos_feats = 128 - temperature = 10000 - scale = 2 * math.pi - - dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device) - dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats) - # N, L, 6 - proposals = proposals.sigmoid() * scale - # N, L, 6, 128 - pos = proposals[:, :, :, None] / dim_t - # N, L, 6, 64, 2 - return torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) - - def gen_encoder_output_proposals( - self, - memory: Tensor, - memory_padding_mask: Tensor, - spatial_shapes: list[tuple[int, int]], - ) -> tuple[Tensor, Tensor]: - """Generate encoder output and proposals. - - Args: - memory (Tensor): Memory tensor of shape (N, S, C). - memory_padding_mask (Tensor): Memory padding mask tensor of shape (N, S). - spatial_shapes (List[Tuple[int, int]]): List of spatial shapes. - - TODO (Kirill): Not used. Remove this function? - - Returns: - Tuple[Tensor, Tensor]: Encoder output tensor of shape (N, S, C) and proposals tensor of shape (N, L, 6). - """ - n_, _, _ = memory.shape - proposals = [] - _cur = 0 - for lvl, (h_, w_) in enumerate(spatial_shapes): - mask_flatten_ = memory_padding_mask[:, _cur : (_cur + h_ * w_)].view(n_, h_, w_, 1) - valid_h = torch.sum(~mask_flatten_[:, :, 0, 0], 1) - valid_w = torch.sum(~mask_flatten_[:, 0, :, 0], 1) - - grid_y, grid_x = torch.meshgrid( - torch.linspace(0, h_ - 1, h_, dtype=torch.float32, device=memory.device), - torch.linspace(0, w_ - 1, w_, dtype=torch.float32, device=memory.device), - ) - grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) - - scale = torch.cat([valid_w.unsqueeze(-1), valid_h.unsqueeze(-1)], 1).view(n_, 1, 1, 2) - grid = (grid.unsqueeze(0).expand(n_, -1, -1, -1) + 0.5) / scale - - lr = torch.ones_like(grid) * 0.05 * (2.0**lvl) - tb = torch.ones_like(grid) * 0.05 * (2.0**lvl) - wh = torch.cat((lr, tb), -1) - - proposal = torch.cat((grid, wh), -1).view(n_, -1, 6) - proposals.append(proposal) - _cur += h_ * w_ - output_proposals = torch.cat(proposals, 1) - output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) - output_proposals = torch.log(output_proposals / (1 - output_proposals)) - output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float("inf")) - output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) - - output_memory = memory - output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0)) - output_memory = output_memory.masked_fill(~output_proposals_valid, float(0)) - output_memory = self.enc_output_norm(self.enc_output(output_memory)) - return output_memory, output_proposals - def get_valid_ratio(self, mask: Tensor) -> Tensor: """Calculate the valid ratio of the mask. @@ -616,7 +537,7 @@ def forward( intermediate_reference_dims, ) - return output, reference_points + return output, reference_points, None class DepthAwareTransformerBuilder: diff --git a/src/otx/algo/object_detection_3d/losses/ddn_loss.py b/src/otx/algo/object_detection_3d/losses/ddn_loss.py index e3a4238be03..671033a347a 100644 --- a/src/otx/algo/object_detection_3d/losses/ddn_loss.py +++ b/src/otx/algo/object_detection_3d/losses/ddn_loss.py @@ -22,13 +22,13 @@ def compute_fg_mask( """Compute foreground mask for images. 
Args: - gt_boxes2d [torch.Tensor(B, N, 4)]: 2D box labels - shape [Tuple[int, int]]: Foreground mask desired shape - downsample_factor [int]: Downsample factor for image - device [torch.device]: Foreground mask desired device + gt_boxes2d (torch.Tensor): 2D box labels. + shape (Tuple[int, int]): Foreground mask desired shape. + downsample_factor (int): Downsample factor for image. + device (torch.device): Foreground mask desired device. Returns: - fg_mask [torch.Tensor(shape)]: Foreground mask + fg_mask (torch.Tensor(shape)]: Foreground mask. """ if device is None: device = torch.device("cpu") @@ -58,9 +58,9 @@ def __init__(self, fg_weight: float, bg_weight: float, downsample_factor: int = """Initialize fixed foreground/background loss balancer. Args: - fg_weight [float]: Foreground loss weight - bg_weight [float]: Background loss weight - downsample_factor [int]: Depth map downsample factor + fg_weight (float): Foreground loss weight. + bg_weight (float): Background loss weight. + downsample_factor (int): Depth map downsample factor. """ super().__init__() self.fg_weight = fg_weight @@ -76,12 +76,11 @@ def forward( """Forward pass. Args: - loss [torch.Tensor(B, H, W)]: Pixel-wise loss - gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing + loss (torch.Tensor): Pixel-wise loss. + gt_boxes2d (torch.Tensor): 2D box labels for foreground/background balancing. Returns: - loss [torch.Tensor(1)]: Total loss after foreground/background balancing - tb_dict [dict[float]]: All losses to log in tensorboard + loss (torch.Tensor): Total loss after foreground/background balancing. """ # Compute masks fg_mask = compute_fg_mask( @@ -120,13 +119,11 @@ def __init__( """Initializes DDNLoss module. Args: - weight [float]: Loss function weight - alpha [float]: Alpha value for Focal Loss - gamma [float]: Gamma value for Focal Loss - disc_cfg [dict]: Depth discretiziation configuration - fg_weight [float]: Foreground loss weight - bg_weight [float]: Background loss weight - downsample_factor [int]: Depth map downsample factor + alpha (float): Alpha value for Focal Loss. + gamma (float): Gamma value for Focal Loss. + fg_weight (float): Foreground loss weight. + bg_weight (float): Background loss weight. + downsample_factor (int): Depth map downsample factor. """ super().__init__() self.balancer = Balancer(downsample_factor=downsample_factor, fg_weight=fg_weight, bg_weight=bg_weight) @@ -146,10 +143,10 @@ def build_target_depth_from_3dcenter( """Builds target depth map from 3D center depth. Args: - depth_logits: torch.Tensor(B, D+1, H, W)]: Predicted depth logits - gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing - gt_center_depth [torch.Tensor(B, N)]: 3D center depth - num_gt_per_img: [int]: Number of ground truth boxes per image + depth_logits: (torch.Tensor): Predicted depth logits. + gt_boxes2d (torch.Tensor)): 2D box labels for foreground/background balancing. + gt_center_depth (torch.Tensor): 3D center depth. + num_gt_per_img: (int): Number of ground truth boxes per image. """ b, _, h, w = depth_logits.shape depth_maps = torch.zeros((b, h, w), device=depth_logits.device, dtype=depth_logits.dtype) @@ -185,18 +182,18 @@ def bin_depths( """Converts depth map into bin indices. 
Args: - depth_map [torch.Tensor(H, W)]: Depth Map - mode [string]: Discretiziation mode (See https://arxiv.org/pdf/2005.13423.pdf for more details) - UD: Uniform discretiziation - LID: Linear increasing discretiziation - SID: Spacing increasing discretiziation - depth_min [float]: Minimum depth value - depth_max [float]: Maximum depth value - num_bins [int]: Number of depth bins - target [bool]: Whether the depth bins indices will be used for a target tensor in loss comparison + depth_map (torch.Tensor): Depth Map. + mode (string): Discretiziation mode (See https://arxiv.org/pdf/2005.13423.pdf for more details). + UD: Uniform discretiziation. + LID: Linear increasing discretiziation. + SID: Spacing increasing discretiziation. + depth_min (float): Minimum depth value. + depth_max (float): Maximum depth value. + num_bins (int): Number of depth bins. + target (bool): Whether the depth bins indices will be used for a target tensor in loss comparison. Returns: - indices [torch.Tensor(H, W)]: Depth bin indices + indices (torch.Tensor): Depth bin indices. """ if mode == "UD": bin_size = (depth_max - depth_min) / num_bins @@ -233,13 +230,13 @@ def forward( """Gets depth_map loss. Args: - depth_logits: torch.Tensor(B, D+1, H, W)]: Predicted depth logits - gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing - num_gt_per_img: [int]: Number of ground truth boxes per image - gt_center_depth: [torch.Tensor(B, N)]: 3D center depth + depth_logits: (torch.Tensor): Predicted depth logits. + gt_boxes2d (torch.Tensor): 2D box labels for foreground/background balancing. + num_gt_per_img: (int): Number of ground truth boxes per image. + gt_center_depth: (torch.Tensor): 3D center depth. Returns: - loss [torch.Tensor(1)]: Depth classification network loss + loss (torch.Tensor): Depth classification network loss. """ # Bin depth map to create target depth_maps = self.build_target_depth_from_3dcenter(depth_logits, gt_boxes2d, gt_center_depth, num_gt_per_img) diff --git a/src/otx/algo/object_detection_3d/losses/monodetr_loss.py b/src/otx/algo/object_detection_3d/losses/monodetr_loss.py index ebc98d45a51..0f2d85d0565 100644 --- a/src/otx/algo/object_detection_3d/losses/monodetr_loss.py +++ b/src/otx/algo/object_detection_3d/losses/monodetr_loss.py @@ -29,11 +29,10 @@ def __init__(self, num_classes: int, weight_dict: dict, focal_alpha: float, grou """MonoDETRCriterion. Args: - num_classes: number of object categories, omitting the special no-object category - matcher: module able to compute a matching between targets and proposals - weight_dict: dict containing as key the names of the losses and as values their relative weight. - focal_alpha: alpha in Focal Loss - group_num: number of groups for data parallelism + num_classes (int): number of object categories, omitting the special no-object category. + weight_dict (dict): dict containing as key the names of the losses and as values their relative weight. + focal_alpha (float): alpha in Focal Loss. + group_num (int): number of groups for data parallelism. """ super().__init__() self.num_classes = num_classes @@ -47,7 +46,15 @@ def __init__(self, num_classes: int, weight_dict: dict, focal_alpha: float, grou self.group_num = group_num def loss_labels(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Classification loss.""" + """Classification loss. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. 
+ targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. + indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch. + """ src_logits = outputs["scores"] idx = self._get_src_permutation_idx(indices) @@ -76,7 +83,15 @@ def loss_labels(self, outputs: dict, targets: list, indices: list, num_boxes: in return {"loss_ce": loss_ce} def loss_3dcenter(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Compute the loss for the 3D center prediction.""" + """Compute the loss for the 3D center prediction. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. + indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch. + """ idx = self._get_src_permutation_idx(indices) src_3dcenter = outputs["boxes_3d"][:, :, 0:2][idx] target_3dcenter = torch.cat([t["boxes_3d"][:, 0:2][i] for t, (_, i) in zip(targets, indices)], dim=0) @@ -85,7 +100,15 @@ def loss_3dcenter(self, outputs: dict, targets: list, indices: list, num_boxes: return {"loss_center": loss_3dcenter.sum() / num_boxes} def loss_boxes(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Compute l1 loss.""" + """Compute l1 loss. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. + indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch. + """ idx = self._get_src_permutation_idx(indices) src_2dboxes = outputs["boxes_3d"][:, :, 2:6][idx] target_2dboxes = torch.cat([t["boxes_3d"][:, 2:6][i] for t, (_, i) in zip(targets, indices)], dim=0) @@ -95,7 +118,15 @@ def loss_boxes(self, outputs: dict, targets: list, indices: list, num_boxes: int return {"loss_bbox": loss_bbox.sum() / num_boxes} def loss_giou(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Compute the GIoU loss.""" + """Compute the GIoU loss. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. + indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch. + """ # giou idx = self._get_src_permutation_idx(indices) src_boxes = outputs["boxes_3d"][idx] @@ -104,7 +135,15 @@ def loss_giou(self, outputs: dict, targets: list, indices: list, num_boxes: int) return {"loss_giou": loss_giou} def loss_depths(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Compute the loss for the depth prediction.""" + """Compute the loss for the depth prediction. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. 
+ indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch + """ idx = self._get_src_permutation_idx(indices) src_depths = outputs["depth"][idx] @@ -117,7 +156,15 @@ def loss_depths(self, outputs: dict, targets: list, indices: list, num_boxes: in return {"loss_depth": depth_loss.sum() / num_boxes} def loss_dims(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Compute the loss for the dimension prediction.""" + """Compute the loss for the dimension prediction. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. + indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch. + """ idx = self._get_src_permutation_idx(indices) src_dims = outputs["size_3d"][idx] target_dims = torch.cat([t["size_3d"][i] for t, (_, i) in zip(targets, indices)], dim=0) @@ -131,7 +178,15 @@ def loss_dims(self, outputs: dict, targets: list, indices: list, num_boxes: int) return {"loss_dim": dim_loss.sum() / num_boxes} def loss_angles(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Compute the loss for the angle prediction.""" + """Compute the loss for the angle prediction. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. + indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch. + """ idx = self._get_src_permutation_idx(indices) heading_input = outputs["heading_angle"][idx] target_heading_angle = torch.cat([t["heading_angle"][i] for t, (_, i) in zip(targets, indices)], dim=0) @@ -158,7 +213,15 @@ def loss_angles(self, outputs: dict, targets: list, indices: list, num_boxes: in return {"loss_angle": angle_loss.sum() / num_boxes} def loss_depth_map(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Depth map loss.""" + """Depth map loss. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. + indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch. 
+ """ depth_map_logits = outputs["pred_depth_map_logits"] num_gt_per_img = [len(t["boxes"]) for t in targets] @@ -174,6 +237,7 @@ def _get_src_permutation_idx( self, indices: list[tuple[torch.Tensor, torch.Tensor]], ) -> tuple[torch.Tensor, torch.Tensor]: + """Get the indices necessary to compute the loss.""" # permute predictions following indices batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) src_idx = torch.cat([src for (src, _) in indices]) @@ -183,6 +247,7 @@ def _get_tgt_permutation_idx( self, indices: list[tuple[torch.Tensor, torch.Tensor]], ) -> tuple[torch.Tensor, torch.Tensor]: + """Get the indices necessary to compute the loss.""" # permute targets following indices batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) tgt_idx = torch.cat([tgt for (_, tgt) in indices]) @@ -210,9 +275,9 @@ def forward( """This performs the loss computation. Args: - outputs: dict of tensors, see the output specification of the model for the format - targets: list of dicts, such that len(targets) == batch_size. - The expected keys in each dict depends on the losses applied, see each loss' doc + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. """ outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"} group_num = self.group_num if self.training else 1 diff --git a/src/otx/algo/object_detection_3d/monodetr3d.py b/src/otx/algo/object_detection_3d/monodetr3d.py index 2ea42e52f95..18d3c072556 100644 --- a/src/otx/algo/object_detection_3d/monodetr3d.py +++ b/src/otx/algo/object_detection_3d/monodetr3d.py @@ -7,19 +7,13 @@ from typing import Any -import numpy as np import torch -from torch import Tensor -from torchvision.ops import box_convert from otx.algo.object_detection_3d.backbones.monodetr_resnet import BackboneBuilder from otx.algo.object_detection_3d.detectors.monodetr import MonoDETR from otx.algo.object_detection_3d.heads.depth_predictor import DepthPredictor from otx.algo.object_detection_3d.heads.depthaware_transformer import DepthAwareTransformerBuilder from otx.algo.object_detection_3d.losses import MonoDETRCriterion -from otx.algo.object_detection_3d.utils.utils import box_cxcylrtb_to_xyxy -from otx.core.data.entity.base import OTXBatchLossEntity -from otx.core.data.entity.object_detection_3d import Det3DBatchDataEntity, Det3DBatchPredEntity from otx.core.exporter.base import OTXModelExporter from otx.core.exporter.detection_3d import OTXObjectDetection3DExporter from otx.core.model.detection_3d import OTX3DDetectionModel @@ -28,9 +22,8 @@ class MonoDETR3D(OTX3DDetectionModel): """OTX Detection model class for MonoDETR3D.""" - mean: tuple[float, float, float] = (0.485, 0.456, 0.406) - std: tuple[float, float, float] = (0.229, 0.224, 0.225) - input_size: tuple[int, int] = (384, 1280) # HxW + mean: tuple[float, float, float] = (123.675, 116.28, 103.53) + std: tuple[float, float, float] = (58.395, 57.12, 57.375) load_from: str | None = None def _build_model(self, num_classes: int) -> MonoDETR: @@ -62,73 +55,6 @@ def _build_model(self, num_classes: int) -> MonoDETR: init_box=False, ) - def _customize_inputs( - self, - entity: Det3DBatchDataEntity, - ) -> dict[str, Any]: - # prepare bboxes for the model - targets_list = [] - img_sizes = torch.from_numpy(np.array([img_info.ori_shape for img_info in 
entity.imgs_info])).to( - device=entity.images.device, - ) - key_list = ["labels", "boxes", "depth", "size_3d", "heading_angle", "boxes_3d"] - for bz in range(len(entity.imgs_info)): - target_dict = {} - for key in key_list: - target_dict[key] = getattr(entity, key)[bz] - targets_list.append(target_dict) - - return { - "images": entity.images, - "calibs": torch.cat([p2.unsqueeze(0) for p2 in entity.calib_matrix], dim=0), - "targets": targets_list, - "img_sizes": img_sizes, - "mode": "loss" if self.training else "predict", - } - - def _customize_outputs( - self, - outputs: dict[str, torch.Tensor], - inputs: Det3DBatchDataEntity, - ) -> Det3DBatchPredEntity | OTXBatchLossEntity: - if self.training: - if not isinstance(outputs, dict): - raise TypeError(outputs) - - losses = OTXBatchLossEntity() - for k, v in outputs.items(): - if isinstance(v, list): - losses[k] = sum(v) - elif isinstance(v, Tensor): - losses[k] = v - else: - msg = "Loss output should be list or torch.tensor but got {type(v)}" - raise TypeError(msg) - return losses - - labels, scores, size_3d, heading_angle, boxes_3d, depth = self.extract_dets_from_outputs(outputs) - # bbox 2d decoding - boxes_2d = box_cxcylrtb_to_xyxy(boxes_3d) - xywh_2d = box_convert(boxes_2d, "xyxy", "cxcywh") - # size 2d decoding - size_2d = xywh_2d[:, :, 2:4] - - return Det3DBatchPredEntity( - batch_size=inputs.batch_size, - images=inputs.images, - imgs_info=inputs.imgs_info, - calib_matrix=inputs.calib_matrix, - boxes=boxes_2d, - labels=labels, - boxes_3d=boxes_3d, - size_2d=size_2d, - size_3d=size_3d, - depth=depth, - heading_angle=heading_angle, - scores=scores, - original_kitti_format=[None], - ) - def configure_optimizers(self) -> tuple[list[torch.optim.Optimizer], list[dict[str, Any]]]: """Configure an optimizer and learning-rate schedulers. @@ -240,7 +166,7 @@ def _exporter(self) -> OTXModelExporter: "opset_version": 16, }, input_names=["images", "calib_matrix", "img_sizes"], - output_names=["scores", "boxes_3d", "size_3d", "heading_angle", "depth"], + output_names=["scores", "boxes_3d", "size_3d", "depth", "heading_angle"], ) @property diff --git a/src/otx/algo/samplers/balanced_sampler.py b/src/otx/algo/samplers/balanced_sampler.py index 287bbf1dcf4..4b6cfb56caa 100644 --- a/src/otx/algo/samplers/balanced_sampler.py +++ b/src/otx/algo/samplers/balanced_sampler.py @@ -65,7 +65,7 @@ def __init__( self.img_indices = {k: torch.tensor(v, dtype=torch.int64) for k, v in ann_stats.items() if len(v) > 0} self.num_cls = len(self.img_indices.keys()) self.data_length = len(self.dataset) - self.num_trials = int(self.data_length / self.num_cls) + self.num_trials = max(int(self.data_length / self.num_cls), 1) if efficient_mode: # Reduce the # of sampling (sampling data for a single epoch) diff --git a/src/otx/algo/segmentation/huggingface_model.py b/src/otx/algo/segmentation/huggingface_model.py index 428a489e1fa..dc7f7f9b242 100644 --- a/src/otx/algo/segmentation/huggingface_model.py +++ b/src/otx/algo/segmentation/huggingface_model.py @@ -162,4 +162,8 @@ def _exporter(self) -> OTXModelExporter: def forward_for_tracing(self, image: torch.Tensor) -> torch.Tensor | dict[str, torch.Tensor]: """Model forward function used for the model tracing during model exportation.""" + if self.explain_mode: + msg = "Explain mode is not supported for this model." 
+ raise NotImplementedError(msg) + return self.model(image) diff --git a/src/otx/algo/segmentation/litehrnet.py b/src/otx/algo/segmentation/litehrnet.py index 33269e04532..fd153877ccd 100644 --- a/src/otx/algo/segmentation/litehrnet.py +++ b/src/otx/algo/segmentation/litehrnet.py @@ -81,7 +81,7 @@ def _exporter(self) -> OTXModelExporter: swap_rgb=False, via_onnx=False, onnx_export_configuration={"operator_export_type": OperatorExportTypes.ONNX_ATEN_FALLBACK}, - output_names=None, + output_names=["preds", "feature_vector"] if self.explain_mode else None, ) @property diff --git a/src/otx/algo/segmentation/segmentors/base_model.py b/src/otx/algo/segmentation/segmentors/base_model.py index 9cad16b45ea..370ac795d73 100644 --- a/src/otx/algo/segmentation/segmentors/base_model.py +++ b/src/otx/algo/segmentation/segmentors/base_model.py @@ -10,6 +10,8 @@ import torch.nn.functional as f from torch import Tensor, nn +from otx.algo.explain.explain_algo import feature_vector_fn + if TYPE_CHECKING: from otx.core.data.entity.base import ImageInfo @@ -58,7 +60,7 @@ def forward( - If mode is "predict", returns the predicted outputs. - Otherwise, returns the model outputs after interpolation. """ - outputs = self.extract_features(inputs) + enc_feats, outputs = self.extract_features(inputs) outputs = f.interpolate(outputs, size=inputs.size()[2:], mode="bilinear", align_corners=True) if mode == "tensor": @@ -76,12 +78,19 @@ def forward( if mode == "predict": return outputs.argmax(dim=1) + if mode == "explain": + feature_vector = feature_vector_fn(enc_feats) + return { + "preds": outputs, + "feature_vector": feature_vector, + } + return outputs - def extract_features(self, inputs: Tensor) -> Tensor: + def extract_features(self, inputs: Tensor) -> tuple[Tensor, Tensor]: """Extract features from the backbone and head.""" enc_feats = self.backbone(inputs) - return self.decode_head(enc_feats) + return enc_feats, self.decode_head(enc_feats) def calculate_loss( self, diff --git a/src/otx/core/config/data.py b/src/otx/core/config/data.py index 5af3016e0e8..111e76e9261 100644 --- a/src/otx/core/config/data.py +++ b/src/otx/core/config/data.py @@ -29,6 +29,8 @@ class SubsetConfig: (`TransformLibType.MMCV`, `TransformLibType.MMPRETRAIN`, ...). transform_lib_type (TransformLibType): Transform library type used by this subset. num_workers (int): Number of workers for the dataloader of this subset. + sampler (SamplerConfig | None): Sampler configuration for the dataloader of this subset. + to_tv_image (bool): Whether to convert image to torch tensor. input_size (int | tuple[int, int] | None) : input size model expects. If $(input_size) exists in transforms, it will be replaced with this value. diff --git a/src/otx/core/config/hpo.py b/src/otx/core/config/hpo.py index 8d4dd085955..29695631ef8 100644 --- a/src/otx/core/config/hpo.py +++ b/src/otx/core/config/hpo.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from pathlib import Path # noqa: TCH003 -from typing import Any, Literal +from typing import Any, Callable, Literal import torch @@ -23,7 +23,12 @@ @dataclass class HpoConfig: - """DTO for HPO configuration.""" + """DTO for HPO configuration. + + progress_update_callback (Callable[[int | float], None] | None): + callback to update progress. If it's given, it's called with progress every second. + callbacks_to_exclude (list[str] | str | None): List of name of callbacks to exclude during HPO. 
+ """ search_space: dict[str, dict[str, Any]] | str | Path | None = None save_path: str | None = None @@ -40,3 +45,5 @@ class HpoConfig: asynchronous_sha: bool = num_workers > 1 metric_name: str | None = None adapt_bs_search_space_max_val: Literal["None", "Safe", "Full"] = "None" + progress_update_callback: Callable[[int | float], None] | None = None + callbacks_to_exclude: list[str] | str | None = None diff --git a/src/otx/core/data/dataset/object_detection_3d.py b/src/otx/core/data/dataset/object_detection_3d.py index 7e7f294c58b..06df0136392 100644 --- a/src/otx/core/data/dataset/object_detection_3d.py +++ b/src/otx/core/data/dataset/object_detection_3d.py @@ -3,8 +3,6 @@ # """Module for OTX3DObjectDetectionDataset.""" -# mypy: ignore-errors - from __future__ import annotations from copy import deepcopy @@ -12,12 +10,8 @@ from typing import TYPE_CHECKING, Any, Callable, List, Union import numpy as np -import torch from datumaro import Image -from PIL import Image as PILImage -from torchvision import tv_tensors -from otx.core.data.dataset.utils.kitti_utils import Calibration, affine_transform, angle2class, get_affine_transform from otx.core.data.entity.base import ImageInfo from otx.core.data.entity.object_detection_3d import Det3DBatchDataEntity, Det3DDataEntity from otx.core.data.mem_cache import NULL_MEM_CACHE_HANDLER, MemCacheHandlerBase @@ -27,7 +21,7 @@ from .base import OTXDataset if TYPE_CHECKING: - from datumaro import Bbox, DatasetSubset + from datumaro import DatasetSubset Transforms = Union[Compose, Callable, List[Callable], dict[str, Compose | Callable | List[Callable]]] @@ -45,10 +39,8 @@ def __init__( max_refetch: int = 1000, image_color_channel: ImageColorChannel = ImageColorChannel.RGB, stack_images: bool = True, - to_tv_image: bool = True, + to_tv_image: bool = False, max_objects: int = 50, - depth_threshold: int = 65, - resolution: tuple[int, int] = (1280, 384), # (W, H) ) -> None: super().__init__( dm_subset, @@ -61,239 +53,56 @@ def __init__( to_tv_image, ) self.max_objects = max_objects - self.depth_threshold = depth_threshold - self.resolution = np.array(resolution) # TODO(Kirill): make it configurable self.subset_type = list(self.dm_subset.get_subset_info())[-1].split(":")[0] def _get_item_impl(self, index: int) -> Det3DDataEntity | None: entity = self.dm_subset[index] image = entity.media_as(Image) - image = self._get_img_data_and_shape(image)[0] - calib = Calibration(entity.attributes["calib_path"]) - original_kitti_format = None # don't use for training - if self.subset_type != "train": - # TODO (Kirill): remove this or duplication of the inputs - annotations_copy = deepcopy(entity.annotations) - original_kitti_format = [obj.attributes for obj in annotations_copy] - # decode original kitti format for metric calculation - for i, anno_dict in enumerate(original_kitti_format): - anno_dict["name"] = self.label_info.label_names[annotations_copy[i].label] - anno_dict["bbox"] = annotations_copy[i].points - dimension = anno_dict["dimensions"] - anno_dict["dimensions"] = [dimension[2], dimension[0], dimension[1]] - original_kitti_format = self._reformate_for_kitti_metric(original_kitti_format) - # decode labels for training - inputs, targets, ori_img_shape = self._decode_item( - PILImage.fromarray(image), - entity.annotations, - calib, - ) - # normilize image - inputs = self._apply_transforms(torch.as_tensor(inputs, dtype=torch.float32)) - return Det3DDataEntity( - image=inputs, + image, ori_img_shape = self._get_img_data_and_shape(image) + calib = 
self.get_calib_from_file(entity.attributes["calib_path"]) + annotations_copy = deepcopy(entity.annotations) + datumaro_kitti_format = [obj.attributes for obj in annotations_copy] + + # decode original kitti format for metric calculation + for i, anno_dict in enumerate(datumaro_kitti_format): + anno_dict["name"] = ( + self.label_info.label_names[annotations_copy[i].label] + if self.subset_type != "train" + else annotations_copy[i].label + ) + anno_dict["bbox"] = annotations_copy[i].points + dimension = anno_dict["dimensions"] + anno_dict["dimensions"] = [dimension[2], dimension[0], dimension[1]] + original_kitti_format = self._reformate_for_kitti_metric(datumaro_kitti_format) + + entity = Det3DDataEntity( + image=image, img_info=ImageInfo( img_idx=index, - img_shape=inputs.shape[1:], - ori_shape=ori_img_shape, # TODO(Kirill): curently we use WxH here, make it HxW + img_shape=ori_img_shape, + ori_shape=ori_img_shape, image_color_channel=self.image_color_channel, ignored_labels=[], ), - boxes=tv_tensors.BoundingBoxes( - targets["boxes"], - format=tv_tensors.BoundingBoxFormat.XYXY, - canvas_size=inputs.shape[1:], - dtype=torch.float32, - ), - labels=torch.as_tensor(targets["labels"], dtype=torch.long), - calib_matrix=torch.as_tensor(calib.P2, dtype=torch.float32), - boxes_3d=torch.as_tensor(targets["boxes_3d"], dtype=torch.float32), - size_2d=torch.as_tensor(targets["size_2d"], dtype=torch.float32), - size_3d=torch.as_tensor(targets["size_3d"], dtype=torch.float32), - depth=torch.as_tensor(targets["depth"], dtype=torch.float32), - heading_angle=torch.as_tensor( - np.concatenate([targets["heading_bin"], targets["heading_res"]], axis=1), - dtype=torch.float32, - ), + boxes=np.zeros((self.max_objects, 4), dtype=np.float32), + labels=np.zeros((self.max_objects), dtype=np.int8), + calib_matrix=calib, + boxes_3d=np.zeros((self.max_objects, 6), dtype=np.float32), + size_2d=np.zeros((self.max_objects, 2), dtype=np.float32), + size_3d=np.zeros((self.max_objects, 3), dtype=np.float32), + depth=np.zeros((self.max_objects, 1), dtype=np.float32), + heading_angle=np.zeros((self.max_objects, 2), dtype=np.float32), original_kitti_format=original_kitti_format, ) + return self._apply_transforms(entity) + @property def collate_fn(self) -> Callable: """Collection function to collect DetDataEntity into DetBatchDataEntity in data loader.""" return partial(Det3DBatchDataEntity.collate_fn, stack_images=self.stack_images) - def _decode_item(self, img: PILImage, annotations: list[Bbox], calib: Calibration) -> tuple: # noqa: C901 - """Decode item for training.""" - # data augmentation for image - img_size = np.array(img.size) - bbox2d = np.array([ann.points for ann in annotations]) - center = img_size / 2 - crop_size, crop_scale = img_size, 1 - random_flip_flag = False - # TODO(Kirill): add data augmentation for 3d, remove them from here. - if self.subset_type == "train": - if np.random.random() < 0.5: - random_flip_flag = True - img = img.transpose(PILImage.FLIP_LEFT_RIGHT) - - if np.random.random() < 0.5: - scale = 0.05 - shift = 0.05 - crop_scale = np.clip(np.random.randn() * scale + 1, 1 - scale, 1 + scale) - crop_size = img_size * crop_scale - center[0] += img_size[0] * np.clip(np.random.randn() * shift, -2 * shift, 2 * shift) - center[1] += img_size[1] * np.clip(np.random.randn() * shift, -2 * shift, 2 * shift) - - # add affine transformation for 2d images. 
- trans, trans_inv = get_affine_transform(center, crop_size, 0, self.resolution, inv=1) - img = img.transform( - tuple(self.resolution.tolist()), - method=PILImage.AFFINE, - data=tuple(trans_inv.reshape(-1).tolist()), - resample=PILImage.BILINEAR, - ) - img = np.array(img).astype(np.float32) - img = img.transpose(2, 0, 1) # C * H * W -> (384 * 1280) - # ============================ get labels ============================== - # data augmentation for labels - annotations_list: list[dict[str, Any]] = [ann.attributes for ann in annotations] - for i, obj in enumerate(annotations_list): - obj["label"] = annotations[i].label - obj["location"] = np.array(obj["location"]) - - if random_flip_flag: - for i in range(bbox2d.shape[0]): - [x1, _, x2, _] = bbox2d[i] - bbox2d[i][0], bbox2d[i][2] = img_size[0] - x2, img_size[0] - x1 - annotations_list[i]["alpha"] = np.pi - annotations_list[i]["alpha"] - annotations_list[i]["rotation_y"] = np.pi - annotations_list[i]["rotation_y"] - if annotations_list[i]["alpha"] > np.pi: - annotations_list[i]["alpha"] -= 2 * np.pi # check range - if annotations_list[i]["alpha"] < -np.pi: - annotations_list[i]["alpha"] += 2 * np.pi - if annotations_list[i]["rotation_y"] > np.pi: - annotations_list[i]["rotation_y"] -= 2 * np.pi - if annotations_list[i]["rotation_y"] < -np.pi: - annotations_list[i]["rotation_y"] += 2 * np.pi - - # labels encoding - mask_2d = np.zeros((self.max_objects), dtype=bool) - labels = np.zeros((self.max_objects), dtype=np.int8) - depth = np.zeros((self.max_objects, 1), dtype=np.float32) - heading_bin = np.zeros((self.max_objects, 1), dtype=np.int64) - heading_res = np.zeros((self.max_objects, 1), dtype=np.float32) - size_2d = np.zeros((self.max_objects, 2), dtype=np.float32) - size_3d = np.zeros((self.max_objects, 3), dtype=np.float32) - src_size_3d = np.zeros((self.max_objects, 3), dtype=np.float32) - boxes = np.zeros((self.max_objects, 4), dtype=np.float32) - boxes_3d = np.zeros((self.max_objects, 6), dtype=np.float32) - - object_num = len(annotations) if len(annotations) < self.max_objects else self.max_objects - for i in range(object_num): - cur_obj = annotations_list[i] - # ignore the samples beyond the threshold [hard encoding] - if cur_obj["location"][-1] > self.depth_threshold and cur_obj["location"][-1] < 2: - continue - - # process 2d bbox & get 2d center - bbox_2d = bbox2d[i].copy() - - # add affine transformation for 2d boxes. 
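# --- Sketch: the 2D point/box warping pattern shared by the removed _decode_item
# and the affine_transform helper kept in the new torchvision transform: a 2x3
# affine matrix is applied to each box corner in homogeneous coordinates.
# The matrix values below are illustrative only.
import numpy as np

def affine_transform_pt(pt: np.ndarray, t: np.ndarray) -> np.ndarray:
    """Apply a 2x3 affine matrix t to one 2D point."""
    hom = np.array([pt[0], pt[1], 1.0], dtype=np.float32)
    return (t @ hom)[:2]

# e.g. a plain resize from a 1242x375 KITTI frame to a 1280x384 input
trans = np.array([[1280 / 1242, 0.0, 0.0],
                  [0.0, 384 / 375, 0.0]], dtype=np.float32)
bbox_xyxy = np.array([100.0, 120.0, 300.0, 240.0], dtype=np.float32)
warped = np.concatenate([affine_transform_pt(bbox_xyxy[:2], trans),
                         affine_transform_pt(bbox_xyxy[2:], trans)])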
- bbox_2d[:2] = affine_transform(bbox_2d[:2], trans) - bbox_2d[2:] = affine_transform(bbox_2d[2:], trans) - - # process 3d center - center_2d = np.array( - [(bbox_2d[0] + bbox_2d[2]) / 2, (bbox_2d[1] + bbox_2d[3]) / 2], - dtype=np.float32, - ) # W * H - corner_2d = bbox_2d.copy() - - center_3d = np.array( - cur_obj["location"] - + [ - 0, - -cur_obj["dimensions"][0] / 2, - 0, - ], - ) # real 3D center in 3D space - center_3d = center_3d.reshape(-1, 3) # shape adjustment (N, 3) - center_3d, _ = calib.rect_to_img(center_3d) # project 3D center to image plane - center_3d = center_3d[0] # shape adjustment - if random_flip_flag: # random flip for center3d - center_3d[0] = img_size[0] - center_3d[0] - center_3d = affine_transform(center_3d.reshape(-1), trans) - - # filter 3d center out of img - proj_inside_img = True - - if center_3d[0] < 0 or center_3d[0] >= self.resolution[0]: - proj_inside_img = False - if center_3d[1] < 0 or center_3d[1] >= self.resolution[1]: - proj_inside_img = False - - if not proj_inside_img: - continue - - # class - labels[i] = cur_obj["label"] - - # encoding 2d/3d boxes - w, h = bbox_2d[2] - bbox_2d[0], bbox_2d[3] - bbox_2d[1] - size_2d[i] = 1.0 * w, 1.0 * h - - center_2d_norm = center_2d / self.resolution - size_2d_norm = size_2d[i] / self.resolution - - corner_2d_norm = corner_2d - corner_2d_norm[0:2] = corner_2d[0:2] / self.resolution - corner_2d_norm[2:4] = corner_2d[2:4] / self.resolution - center_3d_norm = center_3d / self.resolution - - k, r = center_3d_norm[0] - corner_2d_norm[0], corner_2d_norm[2] - center_3d_norm[0] - t, b = center_3d_norm[1] - corner_2d_norm[1], corner_2d_norm[3] - center_3d_norm[1] - - if k < 0 or r < 0 or t < 0 or b < 0: - continue - - boxes[i] = center_2d_norm[0], center_2d_norm[1], size_2d_norm[0], size_2d_norm[1] - boxes_3d[i] = center_3d_norm[0], center_3d_norm[1], k, r, t, b - - # encoding depth - depth[i] = cur_obj["location"][-1] * crop_scale - - # encoding heading angle - heading_angle = calib.ry2alpha(cur_obj["rotation_y"], (bbox2d[i][0] + bbox2d[i][2]) / 2) - if heading_angle > np.pi: - heading_angle -= 2 * np.pi # check range - if heading_angle < -np.pi: - heading_angle += 2 * np.pi - heading_bin[i], heading_res[i] = angle2class(heading_angle) - - # encoding size_3d - src_size_3d[i] = np.array([cur_obj["dimensions"]], dtype=np.float32) - size_3d[i] = src_size_3d[i] - - # filter out the samples with truncated or occluded - if cur_obj["truncated"] <= 0.5 and cur_obj["occluded"] <= 2: - mask_2d[i] = 1 - - # collect return data - targets_for_train = { - "labels": labels[mask_2d], - "boxes": boxes[mask_2d], - "boxes_3d": boxes_3d[mask_2d], - "depth": depth[mask_2d], - "size_2d": size_2d[mask_2d], - "size_3d": size_3d[mask_2d], - "heading_bin": heading_bin[mask_2d], - "heading_res": heading_res[mask_2d], - } - - return img, targets_for_train, img_size - - def _reformate_for_kitti_metric(self, annotations: dict[str, Any]) -> dict[str, np.array]: + def _reformate_for_kitti_metric(self, annotations: list[Any]) -> dict[str, np.array]: """Reformat the annotation for KITTI metric.""" return { "name": np.array([obj["name"] for obj in annotations]), @@ -305,3 +114,13 @@ def _reformate_for_kitti_metric(self, annotations: dict[str, Any]) -> dict[str, "occluded": np.array([obj["occluded"] for obj in annotations]), "truncated": np.array([obj["truncated"] for obj in annotations]), } + + @staticmethod + def get_calib_from_file(calib_file: str) -> np.ndarray: + """Get calibration matrix from txt file (KITTI format).""" + with open(calib_file) as f: # 
noqa: PTH123 + lines = f.readlines() + + obj = lines[2].strip().split(" ")[1:] + + return np.array(obj, dtype=np.float32).reshape(3, 4) diff --git a/src/otx/core/data/dataset/segmentation.py b/src/otx/core/data/dataset/segmentation.py index c0a976b8e40..90cb166c3c3 100644 --- a/src/otx/core/data/dataset/segmentation.py +++ b/src/otx/core/data/dataset/segmentation.py @@ -99,7 +99,7 @@ def _extract_class_mask(item: DatasetItem, img_shape: tuple[int, int], ignore_in msg = "It is not currently support an ignore index which is more than 255." raise ValueError(msg, ignore_index) - # fill mask with background label if we have Polygon/Ellipse annotations + # fill mask with background label if we have Polygon/Ellipse/Bbox annotations fill_value = 0 if isinstance(item.annotations[0], (Ellipse, Polygon, Bbox, RotatedBbox)) else ignore_index class_mask = np.full(shape=img_shape[:2], fill_value=fill_value, dtype=np.uint8) @@ -180,9 +180,9 @@ def __init__( to_tv_image, ) - if self.has_polygons and "background" not in [label_name.lower() for label_name in self.label_info.label_names]: + if self.has_polygons: # insert background class at index 0 since polygons represent only objects - self.label_info.label_names.insert(0, "background") + self.label_info.label_names.insert(0, "otx_background_lbl") self.label_info = SegLabelInfo( label_names=self.label_info.label_names, diff --git a/src/otx/core/data/dataset/utils/kitti_utils.py b/src/otx/core/data/dataset/utils/kitti_utils.py deleted file mode 100644 index 1ee16c41733..00000000000 --- a/src/otx/core/data/dataset/utils/kitti_utils.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# -"""Module defines utils for KITTI Dataset.""" - -# flake8: noqa -# mypy: ignore-errors - -import cv2 -import numpy as np - - -def get_calib_from_file(calib_file): - with open(calib_file) as f: - lines = f.readlines() - - obj = lines[2].strip().split(" ")[1:] - P2 = np.array(obj, dtype=np.float32) - obj = lines[3].strip().split(" ")[1:] - P3 = np.array(obj, dtype=np.float32) - obj = lines[4].strip().split(" ")[1:] - R0 = np.array(obj, dtype=np.float32) - obj = lines[5].strip().split(" ")[1:] - Tr_velo_to_cam = np.array(obj, dtype=np.float32) - - return { - "P2": P2.reshape(3, 4), - "P3": P3.reshape(3, 4), - "R0": R0.reshape(3, 3), - "Tr_velo2cam": Tr_velo_to_cam.reshape(3, 4), - } - - -class Calibration: - def __init__(self, calib_file): - if isinstance(calib_file, str): - calib = get_calib_from_file(calib_file) - else: - calib = calib_file - - self.P2 = calib["P2"] # 3 x 4 - self.R0 = calib["R0"] # 3 x 3 - self.V2C = calib["Tr_velo2cam"] # 3 x 4 - self.C2V = self.inverse_rigid_trans(self.V2C) - - # Camera intrinsics and extrinsics - self.cu = self.P2[0, 2] - self.cv = self.P2[1, 2] - self.fu = self.P2[0, 0] - self.fv = self.P2[1, 1] - self.tx = self.P2[0, 3] / (-self.fu) - self.ty = self.P2[1, 3] / (-self.fv) - - def cart_to_hom(self, pts): - """:param pts: (N, 3 or 2) - :return pts_hom: (N, 4 or 3) - """ - pts_hom = np.hstack((pts, np.ones((pts.shape[0], 1), dtype=np.float32))) - return pts_hom - - def lidar_to_rect(self, pts_lidar): - """:param pts_lidar: (N, 3) - :return pts_rect: (N, 3) - """ - pts_lidar_hom = self.cart_to_hom(pts_lidar) - pts_rect = np.dot(pts_lidar_hom, np.dot(self.V2C.T, self.R0.T)) - # pts_rect = reduce(np.dot, (pts_lidar_hom, self.V2C.T, self.R0.T)) - return pts_rect - - def rect_to_lidar(self, pts_rect): - pts_ref = np.transpose(np.dot(np.linalg.inv(self.R0), np.transpose(pts_rect))) - 
pts_ref = self.cart_to_hom(pts_ref) # nx4 - return np.dot(pts_ref, np.transpose(self.C2V)) - - def rect_to_img(self, pts_rect): - """:param pts_rect: (N, 3) - :return pts_img: (N, 2) - """ - pts_rect_hom = self.cart_to_hom(pts_rect) - pts_2d_hom = np.dot(pts_rect_hom, self.P2.T) - pts_img = (pts_2d_hom[:, 0:2].T / pts_rect_hom[:, 2]).T # (N, 2) - pts_rect_depth = pts_2d_hom[:, 2] - self.P2.T[3, 2] # depth in rect camera coord - return pts_img, pts_rect_depth - - def lidar_to_img(self, pts_lidar): - """:param pts_lidar: (N, 3) - :return pts_img: (N, 2) - """ - pts_rect = self.lidar_to_rect(pts_lidar) - pts_img, pts_depth = self.rect_to_img(pts_rect) - return pts_img, pts_depth - - def img_to_rect(self, u, v, depth_rect): - """:param u: (N) - :param v: (N) - :param depth_rect: (N) - :return: - """ - x = ((u - self.cu) * depth_rect) / self.fu + self.tx - y = ((v - self.cv) * depth_rect) / self.fv + self.ty - pts_rect = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1), depth_rect.reshape(-1, 1)), axis=1) - return pts_rect - - def depthmap_to_rect(self, depth_map): - """:param depth_map: (H, W), depth_map - :return: - """ - x_range = np.arange(0, depth_map.shape[1]) - y_range = np.arange(0, depth_map.shape[0]) - x_idxs, y_idxs = np.meshgrid(x_range, y_range) - x_idxs, y_idxs = x_idxs.reshape(-1), y_idxs.reshape(-1) - depth = depth_map[y_idxs, x_idxs] - pts_rect = self.img_to_rect(x_idxs, y_idxs, depth) - return pts_rect, x_idxs, y_idxs - - def corners3d_to_img_boxes(self, corners3d): - """:param corners3d: (N, 8, 3) corners in rect coordinate - :return: boxes: (None, 4) [x1, y1, x2, y2] in rgb coordinate - :return: boxes_corner: (None, 8) [xi, yi] in rgb coordinate - """ - sample_num = corners3d.shape[0] - corners3d_hom = np.concatenate((corners3d, np.ones((sample_num, 8, 1))), axis=2) # (N, 8, 4) - - img_pts = np.matmul(corners3d_hom, self.P2.T) # (N, 8, 3) - - x, y = img_pts[:, :, 0] / img_pts[:, :, 2], img_pts[:, :, 1] / img_pts[:, :, 2] - x1, y1 = np.min(x, axis=1), np.min(y, axis=1) - x2, y2 = np.max(x, axis=1), np.max(y, axis=1) - - boxes = np.concatenate((x1.reshape(-1, 1), y1.reshape(-1, 1), x2.reshape(-1, 1), y2.reshape(-1, 1)), axis=1) - boxes_corner = np.concatenate((x.reshape(-1, 8, 1), y.reshape(-1, 8, 1)), axis=2) - - return boxes, boxes_corner - - def camera_dis_to_rect(self, u, v, d): - """Can only process valid u, v, d, which means u, v can not beyond the image shape, reprojection error 0.02 - :param u: (N) - :param v: (N) - :param d: (N), the distance between camera and 3d points, d^2 = x^2 + y^2 + z^2 - :return: - """ - assert self.fu == self.fv, "%.8f != %.8f" % (self.fu, self.fv) - fd = np.sqrt((u - self.cu) ** 2 + (v - self.cv) ** 2 + self.fu**2) - x = ((u - self.cu) * d) / fd + self.tx - y = ((v - self.cv) * d) / fd + self.ty - z = np.sqrt(d**2 - x**2 - y**2) - pts_rect = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1), z.reshape(-1, 1)), axis=1) - return pts_rect - - def inverse_rigid_trans(self, Tr): - """Inverse a rigid body transform matrix (3x4 as [R|t]) - [R'|-R't; 0|1] - """ - inv_Tr = np.zeros_like(Tr) # 3x4 - inv_Tr[0:3, 0:3] = np.transpose(Tr[0:3, 0:3]) - inv_Tr[0:3, 3] = np.dot(-np.transpose(Tr[0:3, 0:3]), Tr[0:3, 3]) - return inv_Tr - - def alpha2ry(self, alpha, u): - """Get rotation_y by alpha + theta - 180 - alpha : Observation angle of object, ranging [-pi..pi] - x : Object center x to the camera center (x-W/2), in pixels - rotation_y : Rotation ry around Y-axis in camera coordinates [-pi..pi] - """ - ry = alpha + np.arctan2(u - self.cu, self.fu) - - 
if ry > np.pi: - ry -= 2 * np.pi - if ry < -np.pi: - ry += 2 * np.pi - - return ry - - def ry2alpha(self, ry, u): - alpha = ry - np.arctan2(u - self.cu, self.fu) - - if alpha > np.pi: - alpha -= 2 * np.pi - if alpha < -np.pi: - alpha += 2 * np.pi - - return alpha - - def flip(self, img_size): - wsize = 4 - hsize = 2 - p2ds = ( - np.concatenate( - [ - np.expand_dims(np.tile(np.expand_dims(np.linspace(0, img_size[0], wsize), 0), [hsize, 1]), -1), - np.expand_dims(np.tile(np.expand_dims(np.linspace(0, img_size[1], hsize), 1), [1, wsize]), -1), - np.linspace(2, 78, wsize * hsize).reshape(hsize, wsize, 1), - ], - -1, - ) - ).reshape(-1, 3) - p3ds = self.img_to_rect(p2ds[:, 0:1], p2ds[:, 1:2], p2ds[:, 2:3]) - p3ds[:, 0] *= -1 - p2ds[:, 0] = img_size[0] - p2ds[:, 0] - - # self.P2[0,3] *= -1 - cos_matrix = np.zeros([wsize * hsize, 2, 7]) - cos_matrix[:, 0, 0] = p3ds[:, 0] - cos_matrix[:, 0, 1] = cos_matrix[:, 1, 2] = p3ds[:, 2] - cos_matrix[:, 1, 0] = p3ds[:, 1] - cos_matrix[:, 0, 3] = cos_matrix[:, 1, 4] = 1 - cos_matrix[:, :, -2] = -p2ds[:, :2] - cos_matrix[:, :, -1] = -p2ds[:, :2] * p3ds[:, 2:3] - new_calib = np.linalg.svd(cos_matrix.reshape(-1, 7))[-1][-1] - new_calib /= new_calib[-1] - - new_calib_matrix = np.zeros([4, 3]).astype(np.float32) - new_calib_matrix[0, 0] = new_calib_matrix[1, 1] = new_calib[0] - new_calib_matrix[2, 0:2] = new_calib[1:3] - new_calib_matrix[3, :] = new_calib[3:6] - new_calib_matrix[-1, -1] = self.P2[-1, -1] - self.P2 = new_calib_matrix.T - self.cu = self.P2[0, 2] - self.cv = self.P2[1, 2] - self.fu = self.P2[0, 0] - self.fv = self.P2[1, 1] - self.tx = self.P2[0, 3] / (-self.fu) - self.ty = self.P2[1, 3] / (-self.fv) - - -def get_dir(src_point, rot_rad): - sn, cs = np.sin(rot_rad), np.cos(rot_rad) - - src_result = [0, 0] - src_result[0] = src_point[0] * cs - src_point[1] * sn - src_result[1] = src_point[0] * sn + src_point[1] * cs - - return src_result - - -def get_3rd_point(a, b): - direct = a - b - return b + np.array([-direct[1], direct[0]], dtype=np.float32) - - -def get_affine_transform(center, scale, rot, output_size, shift=np.array([0, 0], dtype=np.float32), inv=0): - if not isinstance(scale, np.ndarray) and not isinstance(scale, list): - scale = np.array([scale, scale], dtype=np.float32) - - scale_tmp = scale - src_w = scale_tmp[0] - dst_w = output_size[0] - dst_h = output_size[1] - - rot_rad = np.pi * rot / 180 - src_dir = get_dir([0, src_w * -0.5], rot_rad) - dst_dir = np.array([0, dst_w * -0.5], np.float32) - - src = np.zeros((3, 2), dtype=np.float32) - dst = np.zeros((3, 2), dtype=np.float32) - src[0, :] = center + scale_tmp * shift - src[1, :] = center + src_dir + scale_tmp * shift - dst[0, :] = [dst_w * 0.5, dst_h * 0.5] - dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5], np.float32) + dst_dir - - src[2:, :] = get_3rd_point(src[0, :], src[1, :]) - dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :]) - - if inv: - trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) - trans_inv = cv2.getAffineTransform(np.float32(dst), np.float32(src)) - return trans, trans_inv - else: - trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) - return trans - - -def affine_transform(pt, t): - new_pt = np.array([pt[0], pt[1], 1.0], dtype=np.float32).T - new_pt = np.dot(t, new_pt) - return new_pt[:2] - - -def angle2class(angle): - """Convert continuous angle to discrete class and residual.""" - num_heading_bin = 12 - angle = angle % (2 * np.pi) - assert angle >= 0 and angle <= 2 * np.pi - angle_per_class = 2 * np.pi / float(num_heading_bin) - shifted_angle 
= (angle + angle_per_class / 2) % (2 * np.pi) - class_id = int(shifted_angle / angle_per_class) - residual_angle = shifted_angle - (class_id * angle_per_class + angle_per_class / 2) - return class_id, residual_angle - - -def class2angle(cls: int, residual: float, to_label_format: bool = False) -> float: - """Inverse function to angle2class.""" - num_heading_bin = 12 - angle_per_class = 2 * np.pi / float(num_heading_bin) - angle_center = cls * angle_per_class - angle = angle_center + residual - if to_label_format and angle > np.pi: - angle = angle - 2 * np.pi - return angle diff --git a/src/otx/core/data/entity/object_detection_3d.py b/src/otx/core/data/entity/object_detection_3d.py index 564ea283a60..17d5d2687b8 100644 --- a/src/otx/core/data/entity/object_detection_3d.py +++ b/src/otx/core/data/entity/object_detection_3d.py @@ -20,17 +20,25 @@ from otx.core.types.task import OTXTaskType if TYPE_CHECKING: + from numpy import ndarray from torch import LongTensor, Tensor @register_pytree_node @dataclass class Det3DDataEntity(OTXDataEntity): - """Data entity for detection task. + """Data entity for 3d object detection task. + + : param boxes (tv_tensors.BoundingBoxes): The bounding boxes for the objects in the image. + : param calib_matrix (Tensor): The calibration matrix for the 3D object detection. + : param boxes_3d (Tensor): The 3D bounding boxes for the objects. + : param size_2d (Tensor): The 2D size of the objects. + : param size_3d (Tensor): The 3D size of the objects. + : param depth (Tensor): The depth of the objects. + : param heading_angle (Tensor): The heading angle of the objects. + : param labels (LongTensor): The labels of the objects. + : param original_kitti_format (list[dict[str, Any]] | None): The original KITTI format of the objects, if available. - :param bboxes: Bbox annotations as top-left-bottom-right - (x1, y1, x2, y2) format with absolute coordinate values - :param labels: Bbox labels as integer indices """ @property @@ -38,30 +46,37 @@ def task(self) -> OTXTaskType: """OTX Task type definition.""" return OTXTaskType.OBJECT_DETECTION_3D - boxes: tv_tensors.BoundingBoxes - calib_matrix: Tensor - boxes_3d: Tensor - size_2d: Tensor - size_3d: Tensor - depth: Tensor - heading_angle: Tensor - labels: LongTensor - original_kitti_format: list[dict[str, Any]] | None + boxes: tv_tensors.BoundingBoxes | ndarray + calib_matrix: Tensor | ndarray + boxes_3d: Tensor | ndarray + size_2d: Tensor | ndarray + size_3d: Tensor | ndarray + depth: Tensor | ndarray + heading_angle: Tensor | ndarray + labels: LongTensor | ndarray + original_kitti_format: dict[str, Any] | None @dataclass class Det3DPredEntity(OTXPredEntity, Det3DDataEntity): - """Data entity to represent the detection model output prediction.""" + """Data entity to represent the 3d object detection model output prediction.""" @dataclass class Det3DBatchDataEntity(OTXBatchDataEntity[Det3DDataEntity]): - """Data entity for detection task. - - :param bboxes: A list of bbox annotations as top-left-bottom-right - (x1, y1, x2, y2) format with absolute coordinate values - :param labels: A list of bbox labels as integer indices - """ # TODO(Kirill): UPDATE! + """Data entity for 3d object detection task. + + : param boxes list[tv_tensors.BoundingBoxes]: The bounding boxes for the objects in the image. + : param calib_matrix list[Tensor]: The calibration matrix for the 3D object detection. + : param boxes_3d list[Tensor]: The 3D bounding boxes for the objects. + : param size_2d list[Tensor]: The 2D size of the objects. 
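# --- Sketch: the 12-bin heading-angle encoding used by both the removed
# kitti_utils.angle2class/class2angle pair and the new static angle2class;
# class2angle is reproduced here only to show the round trip, and the toy
# value at the end is mine.
import numpy as np

NUM_HEADING_BIN = 12

def angle2class(angle: float) -> tuple[int, float]:
    """Continuous heading angle -> (bin id, residual) over 12 bins of 2*pi."""
    angle = angle % (2 * np.pi)
    angle_per_class = 2 * np.pi / NUM_HEADING_BIN
    shifted = (angle + angle_per_class / 2) % (2 * np.pi)
    class_id = int(shifted / angle_per_class)
    residual = shifted - (class_id * angle_per_class + angle_per_class / 2)
    return class_id, residual

def class2angle(cls_id: int, residual: float, to_label_format: bool = False) -> float:
    """Inverse of angle2class."""
    angle_per_class = 2 * np.pi / NUM_HEADING_BIN
    angle = cls_id * angle_per_class + residual
    if to_label_format and angle > np.pi:
        angle -= 2 * np.pi
    return angle

cls_id, res = angle2class(-1.2)
assert abs(class2angle(cls_id, res, to_label_format=True) - (-1.2)) < 1e-6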
+ : param size_3d list[Tensor]: The 3D size of the objects. + : param depth list[Tensor]: The depth of the objects. + : param heading_angle list[Tensor]: The heading angle of the objects. + : param labels list[LongTensor]: The labels of the objects. + : param original_kitti_format list[list[dict[str, Any]] | None]: The original KITTI format of the objects, + if available. Needed for validation and KITTI metric. + """ images: Tensor boxes: list[tv_tensors.BoundingBoxes] @@ -72,7 +87,7 @@ class Det3DBatchDataEntity(OTXBatchDataEntity[Det3DDataEntity]): depth: list[Tensor] heading_angle: list[Tensor] labels: list[LongTensor] - original_kitti_format: list[list[dict[str, Any]] | None] + original_kitti_format: list[dict[str, Any] | None] @property def task(self) -> OTXTaskType: @@ -135,7 +150,7 @@ def pin_memory(self) -> Det3DBatchDataEntity: @dataclass class Det3DBatchPredEntity(OTXBatchPredEntity, Det3DBatchDataEntity): - """Data entity to represent model output predictions for detection task.""" + """Data entity to represent model output predictions for 3d object detection task.""" boxes: tv_tensors.BoundingBoxes scores: Tensor diff --git a/src/otx/core/data/entity/utils.py b/src/otx/core/data/entity/utils.py index 446f91eeee0..fe876d54672 100644 --- a/src/otx/core/data/entity/utils.py +++ b/src/otx/core/data/entity/utils.py @@ -34,7 +34,7 @@ class MulticlassClsDataEntity(OTXDataEntity): """ flatten_fn = lambda obj: (list(obj.values()), list(obj.keys())) unflatten_fn = lambda values, context: cls(**dict(zip(context, values))) - pytree._register_pytree_node( # noqa: SLF001 + pytree.register_pytree_node( cls, flatten_fn=flatten_fn, unflatten_fn=unflatten_fn, diff --git a/src/otx/core/data/pre_filtering.py b/src/otx/core/data/pre_filtering.py index a3924c86752..13fc08c7ebc 100644 --- a/src/otx/core/data/pre_filtering.py +++ b/src/otx/core/data/pre_filtering.py @@ -105,5 +105,4 @@ def remove_unused_labels( mapping = {original_categories[idx]: original_categories[idx] for idx in used_labels} msg = "There are unused labels in dataset, they will be filtered out before training." warnings.warn(msg, stacklevel=2) - return dataset.transform("remap_labels", mapping=mapping, default="delete") diff --git a/src/otx/core/data/transform_libs/torchvision.py b/src/otx/core/data/transform_libs/torchvision.py index 1c77ca2eb8e..1a6da801156 100644 --- a/src/otx/core/data/transform_libs/torchvision.py +++ b/src/otx/core/data/transform_libs/torchvision.py @@ -3406,7 +3406,7 @@ def _get_warp_image( ) -> torch.Tensor: numpy_image: np.ndarray = to_np_image(image) warped_image = cv2.warpAffine(numpy_image, warp_mat, warp_size, flags=cv2.INTER_LINEAR) - return torch.from_numpy(warped_image).permute(2, 0, 1) + return torch.from_numpy(warped_image).to(dtype=torch.float32).permute(2, 0, 1) def __call__(self, *_inputs: T_OTXDataEntity) -> T_OTXDataEntity | None: """Transform function to affine image through warp matrix.""" @@ -3449,6 +3449,335 @@ def __repr__(self) -> str: return repr_str +class Decode3DInputsAffineTransforms(TopdownAffine): + """Transform function for 3D Object Detection to affine image through warp matrix. + + This transform decode the input annotations and apply affine transforms. + + Args: + input_size (tuple[int, int]): Input image size. + random_horizontal_flip (bool): Randomly flip the image horizontally. + random_crop (bool): Randomly crop the image. + decode_annotations (bool): Whether to decode the annotations. + p_crop (float): Probability of cropping. 
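# --- Sketch: the clipped Gaussian crop jitter applied by this transform when
# random_crop fires (with probability p_crop). The clipping ranges follow the
# random_scale / random_shift parameters of the transform; the function name
# and rng handling are mine.
import numpy as np

def jitter_crop_params(ori_size_wh, scale: float = 0.05, shift: float = 0.05, rng=None):
    """Return (center, crop_size, crop_scale) for one randomly jittered crop."""
    rng = np.random.default_rng() if rng is None else rng
    ori = np.asarray(ori_size_wh, dtype=np.float32)  # (W, H)
    center = ori / 2
    crop_scale = float(np.clip(rng.standard_normal() * scale + 1, 1 - scale, 1 + scale))
    crop_size = ori * crop_scale
    center[0] += ori[0] * np.clip(rng.standard_normal() * shift, -2 * shift, 2 * shift)
    center[1] += ori[1] * np.clip(rng.standard_normal() * shift, -2 * shift, 2 * shift)
    return center, crop_size, crop_scale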
+ random_scale (float): Randomly scale the image. + random_shift (float): Randomly shift the image. + depth_threshold (int): Threshold of depth. + max_objects (int): Maximum number of objects. + """ + + def __init__( + self, + input_size: tuple[int, int] | None = None, # (H, W), + random_horizontal_flip: bool = False, + random_crop: bool = False, + decode_annotations: bool = True, + p_crop: float = 0.5, + p_flip: float = 0.5, + random_scale: float = 0.05, + random_shift: float = 0.05, + depth_threshold: int = 65, + max_objects: int = 50, + ) -> None: + self.input_size = input_size # type: ignore[assignment] + self.random_horizontal_flip = random_horizontal_flip + self.random_crop = random_crop + self.decode_annotations = decode_annotations + self.p_crop = p_crop + self.p_flip = p_flip + self.random_scale = random_scale + self.random_shift = random_shift + self.depth_threshold = depth_threshold + self.max_objects = max_objects + + def _affine_transforms( + self, + image: np.ndarray, + ori_img_size: np.ndarray, + warp_size: tuple[int, int], + ) -> tuple[np.ndarray, np.ndarray, np.ndarray, bool]: + """Get affine matrix and warp image. + + Args: + image (np.ndarray): Input image. + ori_img_size (np.ndarray): Original image size. + warp_size (tuple[int, int]): Output image size. + + Returns: + tuple[np.ndarray, np.ndarray, np.ndarray, bool]: + Affine matrix, warped image, and random flip flag. + """ + center = ori_img_size / 2 + crop_size, crop_scale = ori_img_size, 1 + random_flip_flag = False + if self.random_crop and (np.random.random() <= self.p_crop): + crop_scale = np.clip( + np.random.randn() * self.random_scale + 1, + 1 - self.random_scale, + 1 + self.random_scale, + ) + crop_size = ori_img_size * crop_scale + center[0] += ori_img_size[0] * np.clip( + np.random.randn() * self.random_shift, + -2 * self.random_shift, + 2 * self.random_shift, + ) + center[1] += ori_img_size[1] * np.clip( + np.random.randn() * self.random_shift, + -2 * self.random_shift, + 2 * self.random_shift, + ) + + if self.random_horizontal_flip and (np.random.random() <= self.p_flip): + random_flip_flag = True + image = np.fliplr(image) + + trans = self._get_warp_matrix(center, crop_size, 0, warp_size) + return self._get_warp_image(image, trans, warp_size), crop_scale, trans, random_flip_flag + + def __call__(self, *_inputs: T_OTXDataEntity) -> T_OTXDataEntity | None: + """Transform __call__ function to affine image through warp matrix.""" + inputs = _inputs[0] + ori_img_size = np.array(inputs.img_info.ori_shape)[::-1] + # labels encoding + src_size_3d = np.zeros((self.max_objects, 3), dtype=np.float32) + mask_2d = np.zeros((self.max_objects), dtype=bool) + if self.input_size is None: + # No need to resize (OV IR) + inputs.img_info.img_shape = ori_img_size + return self.convert(inputs, mask_2d, image_to_tensor=True) + + annotations_list = inputs.original_kitti_format + h, w = self.input_size + warp_size = (int(w), int(h)) + # transform image + inputs.image, crop_scale, trans, random_flip_flag = self._affine_transforms( + inputs.image, + ori_img_size, + warp_size, + ) + + if not self.decode_annotations: + # resize only (val/test) + inputs.img_info.img_shape = self.input_size + return self.convert(inputs, mask_2d) + + # decode annotations + if random_flip_flag: + for i in range(len(annotations_list["bbox"])): + [x1, _, x2, _] = annotations_list["bbox"][i] + annotations_list["bbox"][i][0], annotations_list["bbox"][i][2] = ( + ori_img_size[0] - x2, + ori_img_size[0] - x1, + ) + annotations_list["alpha"][i] = np.pi - 
annotations_list["alpha"][i] + annotations_list["rotation_y"][i] = np.pi - annotations_list["rotation_y"][i] + if annotations_list["alpha"][i] > np.pi: + annotations_list["alpha"][i] -= 2 * np.pi # check range + if annotations_list["alpha"][i] < -np.pi: + annotations_list["alpha"][i] += 2 * np.pi + if annotations_list["rotation_y"][i] > np.pi: + annotations_list["rotation_y"][i] -= 2 * np.pi + if annotations_list["rotation_y"][i] < -np.pi: + annotations_list["rotation_y"][i] += 2 * np.pi + + object_num = ( + len(annotations_list["bbox"]) if len(annotations_list["bbox"]) < self.max_objects else self.max_objects + ) + for i in range(object_num): + # ignore the samples beyond the threshold [hard encoding] + if annotations_list["location"][i][-1] > self.depth_threshold and annotations_list["location"][i][-1] < 2: + continue + + # process 2d bbox & get 2d center + bbox_2d = annotations_list["bbox"][i].copy() + + # add affine transformation for 2d boxes. + bbox_2d[:2] = self.affine_transform(bbox_2d[:2], trans) + bbox_2d[2:] = self.affine_transform(bbox_2d[2:], trans) + + # process 3d center + center_2d = np.array( + [(bbox_2d[0] + bbox_2d[2]) / 2, (bbox_2d[1] + bbox_2d[3]) / 2], + dtype=np.float32, + ) # W * H + corner_2d = bbox_2d.copy() + + center_3d = np.array( + annotations_list["location"][i] + + [ + 0, + -annotations_list["dimensions"][i][1] / 2, + 0, + ], + ) # real 3D center in 3D space + center_3d = center_3d.reshape(-1, 3) # shape adjustment (N, 3) + center_3d, _ = self.rect_to_img(inputs.calib_matrix, center_3d) # project 3D center to image plane + center_3d = center_3d[0] # shape adjustment + if random_flip_flag: # random flip for center3d + center_3d[0] = ori_img_size[0] - center_3d[0] + center_3d = self.affine_transform(center_3d.reshape(-1), trans) + + # filter 3d center out of img + proj_inside_img = True + + if center_3d[0] < 0 or center_3d[0] >= warp_size[0]: + proj_inside_img = False + if center_3d[1] < 0 or center_3d[1] >= warp_size[1]: + proj_inside_img = False + + if not proj_inside_img: + continue + + # class + inputs.labels[i] = annotations_list["name"][i] + + # encoding 2d/3d boxes + w, h = bbox_2d[2] - bbox_2d[0], bbox_2d[3] - bbox_2d[1] + inputs.size_2d[i] = 1.0 * w, 1.0 * h + + center_2d_norm = center_2d / warp_size + size_2d_norm = inputs.size_2d[i] / warp_size + + corner_2d_norm = corner_2d + corner_2d_norm[0:2] = corner_2d[0:2] / warp_size + corner_2d_norm[2:4] = corner_2d[2:4] / warp_size + center_3d_norm = center_3d / warp_size + + k, r = center_3d_norm[0] - corner_2d_norm[0], corner_2d_norm[2] - center_3d_norm[0] + t, b = center_3d_norm[1] - corner_2d_norm[1], corner_2d_norm[3] - center_3d_norm[1] + + if k < 0 or r < 0 or t < 0 or b < 0: + continue + + inputs.boxes[i] = center_2d_norm[0], center_2d_norm[1], size_2d_norm[0], size_2d_norm[1] + inputs.boxes_3d[i] = center_3d_norm[0], center_3d_norm[1], k, r, t, b + + # encoding depth + inputs.depth[i] = annotations_list["location"][i][-1] * crop_scale + + # encoding heading angle + heading_angle = self.ry2alpha( + inputs.calib_matrix, + annotations_list["rotation_y"][i], + (annotations_list["bbox"][i][0] + annotations_list["bbox"][i][2]) / 2, + ) + if heading_angle > np.pi: + heading_angle -= 2 * np.pi # check range + if heading_angle < -np.pi: + heading_angle += 2 * np.pi + inputs.heading_angle[i] = self.angle2class(heading_angle) + + # encoding size_3d + src_size_3d[i] = np.array( + [ + annotations_list["dimensions"][i][1], + annotations_list["dimensions"][i][2], + annotations_list["dimensions"][i][0], + ], + 
dtype=np.float32, + ) + inputs.size_3d[i] = src_size_3d[i] + + # filter out the samples with truncated or occluded + if annotations_list["truncated"][i] <= 0.5 and annotations_list["occluded"][i] <= 2: + mask_2d[i] = 1 + + # update img_info + inputs.img_info.img_shape = self.input_size + + return self.convert(inputs, mask_2d) + + @staticmethod + def affine_transform(pt: np.ndarray, t: np.ndarray) -> np.ndarray: + """Apply an affine transformation to the points.""" + new_pt = np.array([pt[0], pt[1], 1.0], dtype=np.float32).T + new_pt = np.dot(t, new_pt) + return new_pt[:2] + + @staticmethod + def rect_to_img(p2: np.ndarray, pts_rect: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + """Convert camera coordinates to image coordinates. + + Args: + p2 (np.ndarray): Projection matrix with shape (3, 4). + pts_rect (np.ndarray): Rectangular coordinates with shape (N, 4). + + Returns: + tuple[np.ndarray, np.ndarray]: Image coordinates with shape (N, 2). + """ + + def cart_to_hom(pts: np.ndarray) -> np.ndarray: + """Convert Cartesian coordinates to homogeneous coordinates. + + Args: + pts (np.ndarray): Array of Cartesian coordinates with shape (N, D), + where N is the number of points and D is the number of dimensions. + + Returns: + np.ndarray: Array of homogeneous coordinates with shape (N, D+1), + where N is the number of points and D is the number of dimensions. + """ + return np.hstack((pts, np.ones((pts.shape[0], 1), dtype=np.float32))) + + pts_rect_hom = cart_to_hom(pts_rect) + pts_2d_hom = np.dot(pts_rect_hom, p2.T) + pts_img = (pts_2d_hom[:, 0:2].T / pts_rect_hom[:, 2]).T # (N, 2) + pts_rect_depth = pts_2d_hom[:, 2] - p2.T[3, 2] # depth in rect camera coord + return pts_img, pts_rect_depth + + @staticmethod + def ry2alpha(p2: np.ndarray, ry: np.ndarray, u: np.ndarray) -> np.ndarray: + """Get observation angle of object. + + Args: + p2 (np.ndarray): Projection matrix with shape (3, 4). + ry (np.ndarray): Observation angle of object with shape (N, ). + u (np.ndarray): Pixel coordinates with shape (N, 2). + + Returns: + np.ndarray: Observation angle of object with shape (N, ). 
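# --- Sketch: the camera-to-image projection implemented by rect_to_img.
# p2 is the 3x4 P2 matrix, pts_rect are (N, 3) points in the rectified camera
# frame; the function name and the toy call are mine.
import numpy as np

def project_rect_to_image(pts_rect: np.ndarray, p2: np.ndarray):
    pts_hom = np.hstack([pts_rect, np.ones((pts_rect.shape[0], 1), dtype=np.float32)])  # (N, 4)
    pts_2d_hom = pts_hom @ p2.T                    # (N, 3)
    pts_img = pts_2d_hom[:, :2] / pts_hom[:, 2:3]  # divide by the point depth z
    depth = pts_2d_hom[:, 2] - p2.T[3, 2]          # depth in the rect camera frame
    return pts_img, depth

# uv, z = project_rect_to_image(np.array([[1.0, 1.5, 10.0]], dtype=np.float32),
#                               np.eye(3, 4, dtype=np.float32))  # toy P2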
+ """ + alpha = ry - np.arctan2(u - p2[0, 2], p2[0, 0]) + + if alpha > np.pi: + alpha -= 2 * np.pi + if alpha < -np.pi: + alpha += 2 * np.pi + + return alpha + + @staticmethod + def angle2class(angle: float) -> tuple[int, float]: + """Convert continuous angle to discrete class and residual.""" + num_heading_bin = 12 + angle = angle % (2 * np.pi) + if not (angle >= 0 and angle <= 2 * np.pi): + msg = "angle not in 0 ~ 2pi" + raise ValueError(msg) + + angle_per_class = 2 * np.pi / float(num_heading_bin) + shifted_angle = (angle + angle_per_class / 2) % (2 * np.pi) + class_id = int(shifted_angle / angle_per_class) + residual_angle = shifted_angle - (class_id * angle_per_class + angle_per_class / 2) + return class_id, residual_angle + + def convert(self, inputs: T_OTXDataEntity, mask_2d: np.ndarray, image_to_tensor: bool = False) -> T_OTXDataEntity: # type: ignore[override] + """Convert the data entity to torchvision format.""" + if image_to_tensor: + inputs.image = torch.from_numpy(inputs.image).permute(2, 0, 1) + inputs.labels = torch.as_tensor(inputs.labels[mask_2d], dtype=torch.long) + inputs.boxes = tv_tensors.BoundingBoxes(inputs.boxes[mask_2d], format="XYXY", canvas_size=self.input_size) + inputs.boxes_3d = torch.as_tensor(inputs.boxes_3d[mask_2d], dtype=torch.float32) + inputs.size_2d = torch.as_tensor(inputs.size_2d[mask_2d], dtype=torch.float32) + inputs.size_3d = torch.as_tensor(inputs.size_3d[mask_2d], dtype=torch.float32) + inputs.depth = torch.as_tensor(inputs.depth[mask_2d], dtype=torch.float32) + inputs.heading_angle = torch.as_tensor(inputs.heading_angle[mask_2d], dtype=torch.float32) + inputs.calib_matrix = torch.as_tensor(inputs.calib_matrix, dtype=torch.float32) + + return inputs + + class TorchVisionTransformLib: """Helper to support TorchVision transforms (only V2) in OTX.""" diff --git a/src/otx/core/exporter/base.py b/src/otx/core/exporter/base.py index 85d77fe4799..cfbc670e58e 100644 --- a/src/otx/core/exporter/base.py +++ b/src/otx/core/exporter/base.py @@ -45,6 +45,9 @@ class OTXModelExporter: output_names (list[str] | None, optional): Names for model's outputs, which would be embedded into resulting model. Note, that order of the output names should be the same, as in the target model. + input_names (list[str] | None, optional): Names for model's inputs, which would be + embedded into resulting model. Note, that order of the input names should be the same, + as in the target model. """ def __init__( diff --git a/src/otx/core/exporter/detection_3d.py b/src/otx/core/exporter/detection_3d.py index 17b1377436a..1c33f5130ed 100644 --- a/src/otx/core/exporter/detection_3d.py +++ b/src/otx/core/exporter/detection_3d.py @@ -98,3 +98,26 @@ def to_onnx( log.info("Converting to ONNX is done.") return Path(save_path) + + def to_exportable_code( + self, + model: OTXModel, + output_dir: Path, + base_model_name: str = "exported_model", + precision: OTXPrecisionType = OTXPrecisionType.FP32, + ) -> Path: + """Export to zip folder final OV IR model with runable demo. + + NOT SUPPORTED FOR OD 3D. It will raise an error. + + Args: + model (OTXModel): OTXModel to be exported + output_dir (Path): path to the directory to store export artifacts + base_model_name (str, optional): exported model name + precision (OTXExportPrecisionType, optional): precision of the exported model's weights + + Returns: + Path: path to the exported model. + """ + msg = "Exportable code option is not supported for Object Detection 3D." 
+ raise NotImplementedError(msg) diff --git a/src/otx/core/exporter/exportable_code/demo/requirements.txt b/src/otx/core/exporter/exportable_code/demo/requirements.txt index c816a10f57c..cb535522808 100644 --- a/src/otx/core/exporter/exportable_code/demo/requirements.txt +++ b/src/otx/core/exporter/exportable_code/demo/requirements.txt @@ -1,3 +1,3 @@ openvino==2024.4.0 -openvino-model-api==0.2.4 +openvino-model-api==0.2.5 numpy==1.26.4 diff --git a/src/otx/core/metrics/accuracy.py b/src/otx/core/metrics/accuracy.py index 7c2aeb975bf..62c61da610b 100644 --- a/src/otx/core/metrics/accuracy.py +++ b/src/otx/core/metrics/accuracy.py @@ -290,12 +290,17 @@ def __init__( ] # Multilabel classification accuracy metrics - if self.num_multilabel_classes > 0: + # https://github.com/Lightning-AI/torchmetrics/blob/6377aa5b6fe2863761839e6b8b5a857ef1b8acfa/src/torchmetrics/functional/classification/stat_scores.py#L583-L584 + # MultilabelAccuracy is available when num_multilabel_classes is greater than 2. + self.multilabel_accuracy = None + if self.num_multilabel_classes > 1: self.multilabel_accuracy = TorchmetricMultilabelAcc( num_labels=self.num_multilabel_classes, threshold=0.5, average="macro", ) + elif self.num_multilabel_classes == 1: + self.multilabel_accuracy = TorchmetricAcc(task="binary", num_classes=self.num_multilabel_classes) def _apply(self, fn: Callable, exclude_state: Sequence[str] = "") -> nn.Module: self.multiclass_head_accuracy = [ @@ -305,7 +310,7 @@ def _apply(self, fn: Callable, exclude_state: Sequence[str] = "") -> nn.Module: ) for acc in self.multiclass_head_accuracy ] - if self.num_multilabel_classes > 0: + if self.multilabel_accuracy is not None: self.multilabel_accuracy = self.multilabel_accuracy._apply(fn, exclude_state) # noqa: SLF001 return self @@ -324,7 +329,7 @@ def update(self, preds: torch.Tensor, target: torch.Tensor) -> None: target_multiclass[multiclass_mask], ) - if self.num_multilabel_classes > 0: + if self.multilabel_accuracy is not None: # Split preds into multiclass and multilabel parts preds_multilabel = preds[:, self.num_multiclass_heads :] target_multilabel = target[:, self.num_multiclass_heads :] @@ -339,7 +344,7 @@ def compute(self) -> torch.Tensor: ), ) - if self.num_multilabel_classes > 0: + if self.multilabel_accuracy is not None: multilabel_acc = self.multilabel_accuracy.compute() return (multiclass_accs + multilabel_acc) / 2 diff --git a/src/otx/core/metrics/average_precision_3d.py b/src/otx/core/metrics/average_precision_3d.py index 7b8530ba684..2600200280b 100644 --- a/src/otx/core/metrics/average_precision_3d.py +++ b/src/otx/core/metrics/average_precision_3d.py @@ -7,8 +7,10 @@ from typing import TYPE_CHECKING +import torch from torch import Tensor from torchmetrics import Metric +from torchmetrics.detection.mean_ap import MeanAveragePrecision from otx.core.metrics.kitti_3d_eval import get_coco_eval_result @@ -32,6 +34,7 @@ def __init__( super().__init__() self.label_info: LabelInfo = label_info + self.mean_ap: MeanAveragePrecision = MeanAveragePrecision(box_format="xyxy", iou_type="bbox") self.reset() def reset(self) -> None: @@ -42,6 +45,7 @@ def reset(self) -> None: super().reset() self.preds: list[dict[str, np.array]] = [] self.targets: list[dict[str, np.array]] = [] + self.mean_ap.reset() def update(self, preds: list[dict[str, Tensor]], target: list[dict[str, Tensor]]) -> None: """Update total predictions and targets from given batch predicitons and targets.""" @@ -51,13 +55,35 @@ def update(self, preds: list[dict[str, Tensor]], target: 
list[dict[str, Tensor]] def compute(self) -> dict: """Compute metrics for 3d object detection.""" current_classes = self.label_info.label_names - map_bbox, map_3d = get_coco_eval_result( + preds_for_torchmetrics = self.prepare_inputs_for_map_coco(self.preds) + targets_for_torchmetrics = self.prepare_inputs_for_map_coco(self.targets) + ap_bbox_coco = self.mean_ap(preds_for_torchmetrics, targets_for_torchmetrics) + ap_3d = get_coco_eval_result( self.targets, self.preds, current_classes=[curcls.lower() for curcls in current_classes], ) - # use moderate difficulty as final score. Average across all calsses. - return {"mAP_bbox_3d": Tensor([map_3d[:, 1].mean()]), "mAP_bbox_2d": Tensor([map_bbox[:, 1].mean()])} + # Average across all classes. + return { + "AP_3d@0.5": Tensor([ap_3d[0]]), + "AP_2d@0.5": ap_bbox_coco["map_50"], + "mAP_3d": Tensor([ap_3d.mean()]), + "mAP_2d": ap_bbox_coco["map"], + } + + def prepare_inputs_for_map_coco(self, targets: list[dict[str, np.array]]) -> list[dict[str, Tensor]]: + """Prepare targets for torchmetrics.""" + return [ + { + "boxes": torch.tensor(target["bbox"]), + "scores": torch.tensor(target["score"]) if "score" in target else None, + "labels": torch.tensor( + [self.label_info.label_names.index(label) for label in target["name"]], + dtype=torch.long, + ), + } + for target in targets + ] def _kitti_metric_measure_callable(label_info: LabelInfo) -> KittiMetric: diff --git a/src/otx/core/metrics/kitti_3d_eval/eval.py b/src/otx/core/metrics/kitti_3d_eval/eval.py index 951cc96538d..34144fa4797 100644 --- a/src/otx/core/metrics/kitti_3d_eval/eval.py +++ b/src/otx/core/metrics/kitti_3d_eval/eval.py @@ -3,12 +3,10 @@ # """KITTI 3D eval for OTX.""" -# flake8: noqa -# mypy: ignore-errors from __future__ import annotations -import io as sysio +import logging from typing import Any import numba @@ -21,47 +19,11 @@ from .rotate_iou import rotate_iou_eval_cpu as rotate_iou_eval -@numba.jit(nopython=True) -def get_thresholds( - scores: np.ndarray, # 1D array of confidence scores - num_gt: int, # Number of ground truth objects - num_sample_pts: int = 41, # Number of sample points used to compute recall thresholds -) -> np.ndarray: # 1D array of recall thresholds - """Compute recall thresholds for a given score array. - - Args: - scores (np.ndarray): 1D array of confidence scores. - num_gt (int): Number of ground truth objects. - num_sample_pts (int, optional): Number of sample points used to - compute recall thresholds. Defaults to 41. - - Returns: - np.ndarray: 1D array of recall thresholds. - """ - scores.sort() - scores = scores[::-1] - current_recall = 0 - thresholds = [] - for i, score in enumerate(scores): - l_recall = (i + 1) / num_gt - if i < (len(scores) - 1): - r_recall = (i + 2) / num_gt - else: - r_recall = l_recall - if ((r_recall - current_recall) < (current_recall - l_recall)) and (i < (len(scores) - 1)): - continue - # recall = l_recall - thresholds.append(score) - current_recall += 1 / (num_sample_pts - 1.0) - return thresholds - - def clean_data( gt_anno: dict, # ground truth annotations dt_anno: dict, # detection results current_class: str, # the current class name - difficulty: int, # the difficulty level -) -> tuple: # (num_valid_gt, ignored_gt, ignored_dt, dc_bboxes) +) -> tuple: # (num_valid_gt, ignored_gt, ignored_dt) """Filter out the objects that are not in the current class. Args: @@ -71,12 +33,12 @@ def clean_data( difficulty (int): The difficulty level. Returns: - tuple: The number of valid objects, ignored_gt, ignored_dt, and dc_bboxes. 
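# --- Sketch: how the 2D AP side of the new KittiMetric is obtained. The
# prepared prediction/target dicts are fed to torchmetrics' MeanAveragePrecision
# exactly as above; the tensors here are toy values.
import torch
from torchmetrics.detection.mean_ap import MeanAveragePrecision

metric = MeanAveragePrecision(box_format="xyxy", iou_type="bbox")
preds = [{
    "boxes": torch.tensor([[10.0, 10.0, 60.0, 80.0]]),
    "scores": torch.tensor([0.9]),
    "labels": torch.tensor([0]),
}]
target = [{
    "boxes": torch.tensor([[12.0, 8.0, 58.0, 82.0]]),
    "labels": torch.tensor([0]),
}]
metric.update(preds, target)
result = metric.compute()  # dict containing "map", "map_50", ... as used above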
+ tuple: The number of valid objects, ignored_gt, ignored_dt. """ - MIN_HEIGHT = [40, 25, 25] - MAX_OCCLUSION = [0, 1, 2] - MAX_TRUNCATION = [0.15, 0.3, 0.5] - dc_bboxes, ignored_gt, ignored_dt = [], [], [] + min_height = 20 + max_occlusion = 2 + max_truncation = 0.5 + ignored_gt, ignored_dt = [], [] num_gt = len(gt_anno["name"]) num_dt = len(dt_anno["name"]) num_valid_gt = 0 @@ -87,19 +49,18 @@ def clean_data( valid_class = -1 if gt_name == current_class: valid_class = 1 - elif current_class == "Pedestrian".lower() and "Person_sitting".lower() == gt_name: - valid_class = 0 - elif current_class == "Car".lower() and "Van".lower() == gt_name: + elif (current_class == "Pedestrian".lower() and "Person_sitting".lower() == gt_name) or ( + current_class == "Car".lower() and "Van".lower() == gt_name + ): valid_class = 0 else: valid_class = -1 ignore = False if ( - (gt_anno["occluded"][i] > MAX_OCCLUSION[difficulty]) - or (gt_anno["truncated"][i] > MAX_TRUNCATION[difficulty]) - or (height <= MIN_HEIGHT[difficulty]) - ): - # if gt_anno["difficulty"][i] > difficulty or gt_anno["difficulty"][i] == -1: + (gt_anno["occluded"][i] > max_occlusion) + or (gt_anno["truncated"][i] > max_truncation) + or (height <= min_height) + ): # filter extrim cases ignore = True if valid_class == 1 and not ignore: ignored_gt.append(0) @@ -108,74 +69,35 @@ def clean_data( ignored_gt.append(1) else: ignored_gt.append(-1) - # for i in range(num_gt): - if gt_anno["name"][i] == "DontCare": - dc_bboxes.append(gt_anno["bbox"][i]) + for i in range(num_dt): - if dt_anno["name"][i].lower() == current_class: - valid_class = 1 - else: - valid_class = -1 + valid_class = 1 if dt_anno["name"][i].lower() == current_class else -1 height = abs(dt_anno["bbox"][i, 3] - dt_anno["bbox"][i, 1]) - if height < MIN_HEIGHT[difficulty]: + if height < min_height: ignored_dt.append(1) elif valid_class == 1: ignored_dt.append(0) else: ignored_dt.append(-1) - return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes - - -@numba.jit(nopython=True) -def image_box_overlap( - boxes: np.ndarray, # shape: (N, 4) - query_boxes: np.ndarray, # shape: (K, 4) - criterion: int = -1, # default overlap criterion, -1: intersection over union, 0: intersection over box area, 1: intersection over query box area -) -> np.ndarray: # shape: (N, K) - """Args: - boxes (np.ndarray): shape: (N, 4), 2D boxes, (x1, y1, x2, y2) - query_boxes (np.ndarray): shape: (K, 4), 2D boxes, (x1, y1, x2, y2) - criterion (int, optional): overlap criterion, -1: intersection over union, 0: intersection over box area, 1: intersection over query box area. Defaults to -1. 
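# --- Sketch: the single-difficulty filtering rule that clean_data now applies
# (height <= 20 px, occlusion > 2, or truncation > 0.5 marks a ground truth as
# ignored). The neighbouring-class handling (Van for Car, Person_sitting for
# Pedestrian) is omitted here; names are mine.
MIN_HEIGHT_PX = 20
MAX_OCCLUSION = 2
MAX_TRUNCATION = 0.5

def gt_ignore_flag(bbox_xyxy, occluded: int, truncated: float, is_current_class: bool) -> int:
    """Return 0 (evaluate), 1 (ignore), or -1 (other class)."""
    height = abs(bbox_xyxy[3] - bbox_xyxy[1])
    too_hard = occluded > MAX_OCCLUSION or truncated > MAX_TRUNCATION or height <= MIN_HEIGHT_PX
    if is_current_class and not too_hard:
        return 0
    if is_current_class:  # right class, but filtered out as too hard
        return 1
    return -1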
- - Returns: - np.ndarray: shape: (N, K), overlap between boxes and query_boxes - """ - N = boxes.shape[0] - K = query_boxes.shape[0] - overlaps = np.zeros((N, K), dtype=boxes.dtype) - for k in range(K): - qbox_area = (query_boxes[k, 2] - query_boxes[k, 0]) * (query_boxes[k, 3] - query_boxes[k, 1]) - for n in range(N): - iw = min(boxes[n, 2], query_boxes[k, 2]) - max(boxes[n, 0], query_boxes[k, 0]) - if iw > 0: - ih = min(boxes[n, 3], query_boxes[k, 3]) - max(boxes[n, 1], query_boxes[k, 1]) - if ih > 0: - if criterion == -1: - ua = (boxes[n, 2] - boxes[n, 0]) * (boxes[n, 3] - boxes[n, 1]) + qbox_area - iw * ih - elif criterion == 0: - ua = (boxes[n, 2] - boxes[n, 0]) * (boxes[n, 3] - boxes[n, 1]) - elif criterion == 1: - ua = qbox_area - else: - ua = 1.0 - overlaps[n, k] = iw * ih / ua - return overlaps + return num_valid_gt, ignored_gt, ignored_dt @numba.jit(nopython=True) def d3_box_overlap_kernel( - boxes: np.ndarray, # shape: (N, 7) - qboxes: np.ndarray, # shape: (K, 7) - rinc: np.ndarray, # shape: (N, K) + boxes: np.ndarray, # shape: (n, 7) + qboxes: np.ndarray, # shape: (k, 7) + rinc: np.ndarray, # shape: (n, k) criterion: int = -1, # default overlap criterion ) -> None: - """Args: - boxes: Array of shape (N, 7) representing N 3D boxes. - qboxes: Array of shape (K, 7) representing K 3D boxes. - rinc: Array of shape (N, K) representing the overlap between boxes + """Calculate 3D box overlap. + + Args: + boxes (np.ndarray): Array of shape (n, 7) representing n 3D boxes. + qboxes (np.ndarray): Array of shape (k, 7) representing k 3D boxes. + rinc (np.ndarray): Array of shape (n, k) representing the overlap between boxes and qboxes. - criterion: Overlap criterion. Defaults to -1. If -1, uses the + criterion (int, optional): Overlap criterion. Defaults to -1. If -1, uses the intersection-over-union (IoU) criterion. If 0, uses the intersection-over-area1 criterion. If 1, uses the intersection-over-area2 criterion. @@ -184,12 +106,10 @@ def d3_box_overlap_kernel( None """ # ONLY support overlap in CAMERA, not lidar. - N, K = boxes.shape[0], qboxes.shape[0] - for i in range(N): - for j in range(K): + n, k = boxes.shape[0], qboxes.shape[0] + for i in range(n): + for j in range(k): if rinc[i, j] > 0: - # iw = (min(boxes[i, 1] + boxes[i, 4], qboxes[j, 1] + - # qboxes[j, 4]) - max(boxes[i, 1], qboxes[j, 1])) iw = min(boxes[i, 1], qboxes[j, 1]) - max(boxes[i, 1] - boxes[i, 4], qboxes[j, 1] - qboxes[j, 4]) if iw > 0: @@ -216,12 +136,9 @@ def compute_statistics_jit( dt_datas: np.ndarray, # shape: (total_dt_num, 7) ignored_gt: list[int], # shape: (total_gt_num) ignored_det: list[int], # shape: (total_dt_num) - dc_bboxes: np.ndarray, # shape: (total_dc_num, 4) - metric: int, min_overlap: float, thresh: float = 0, compute_fp: bool = False, - compute_aos: bool = False, ) -> tuple[int, int, int, float, np.ndarray]: """This function computes statistics of an evaluation. @@ -231,12 +148,9 @@ def compute_statistics_jit( dt_datas (np.ndarray): Detection data. ignored_gt (List[int]): Ignore ground truth indices. ignored_det (List[int]): Ignore detection indices. - dc_bboxes (np.ndarray): Don't care bboxes. - metric (int): Evaluation metric. min_overlap (float): Minimum overlap between dt and gt bboxes. thresh (float): Detection score threshold. Defaults to 0. compute_fp (bool): Whether to compute false positives. Defaults to False. - compute_aos (bool): Whether to compute average orientation similarity. Defaults to False. 
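# --- Sketch: a vectorized NumPy equivalent of the removed image_box_overlap
# helper for its default criterion (plain IoU between (N, 4) and (K, 4) xyxy
# boxes); 2D AP is computed through torchmetrics after this patch, so this is
# shown only for reference.
import numpy as np

def image_box_iou(boxes: np.ndarray, query_boxes: np.ndarray) -> np.ndarray:
    """Return the (N, K) IoU matrix."""
    x1 = np.maximum(boxes[:, None, 0], query_boxes[None, :, 0])
    y1 = np.maximum(boxes[:, None, 1], query_boxes[None, :, 1])
    x2 = np.minimum(boxes[:, None, 2], query_boxes[None, :, 2])
    y2 = np.minimum(boxes[:, None, 3], query_boxes[None, :, 3])
    inter = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)
    area_a = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    area_b = (query_boxes[:, 2] - query_boxes[:, 0]) * (query_boxes[:, 3] - query_boxes[:, 1])
    return inter / (area_a[:, None] + area_b[None, :] - inter)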
Returns: Tuple[int, int, int, float, np.ndarray]: tp, fp, fn, similarity, thresholds @@ -244,29 +158,22 @@ def compute_statistics_jit( det_size = dt_datas.shape[0] gt_size = gt_datas.shape[0] dt_scores = dt_datas[:, -1] - dt_alphas = dt_datas[:, 4] - gt_alphas = gt_datas[:, 4] - dt_bboxes = dt_datas[:, :4] assigned_detection = [False] * det_size - ignored_threshold = [False] * det_size + ignored_obj_by_threshold = [False] * det_size if compute_fp: for i in range(det_size): if dt_scores[i] < thresh: - ignored_threshold[i] = True - NO_DETECTION = -10000000 + ignored_obj_by_threshold[i] = True + no_detection = -10000000 tp, fp, fn, similarity = 0, 0, 0, 0 - # thresholds = [0.0] - # delta = [0.0] - thresholds = np.zeros((gt_size,)) + tp_scores = np.zeros((gt_size,)) thresh_idx = 0 - delta = np.zeros((gt_size,)) - delta_idx = 0 for i in range(gt_size): if ignored_gt[i] == -1: continue det_idx = -1 - valid_detection = NO_DETECTION + valid_detection = no_detection max_overlap = 0 assigned_ignored_det = False @@ -275,7 +182,7 @@ def compute_statistics_jit( continue if assigned_detection[j]: continue - if ignored_threshold[j]: + if ignored_obj_by_threshold[j]: continue overlap = overlaps[j, i] dt_score = dt_scores[j] @@ -292,58 +199,32 @@ def compute_statistics_jit( det_idx = j valid_detection = 1 assigned_ignored_det = False - elif compute_fp and (overlap > min_overlap) and (valid_detection == NO_DETECTION) and ignored_det[j] == 1: + elif compute_fp and (overlap > min_overlap) and (valid_detection == no_detection) and ignored_det[j] == 1: det_idx = j valid_detection = 1 assigned_ignored_det = True - if (valid_detection == NO_DETECTION) and ignored_gt[i] == 0: + if (valid_detection == no_detection) and ignored_gt[i] == 0: fn += 1 - elif (valid_detection != NO_DETECTION) and (ignored_gt[i] == 1 or ignored_det[det_idx] == 1): + elif (valid_detection != no_detection) and (ignored_gt[i] == 1 or ignored_det[det_idx] == 1): assigned_detection[det_idx] = True - elif valid_detection != NO_DETECTION: + elif valid_detection != no_detection: tp += 1 - # thresholds.append(dt_scores[det_idx]) - thresholds[thresh_idx] = dt_scores[det_idx] + + tp_scores[thresh_idx] = dt_scores[det_idx] thresh_idx += 1 - if compute_aos: - # delta.append(gt_alphas[i] - dt_alphas[det_idx]) - delta[delta_idx] = gt_alphas[i] - dt_alphas[det_idx] - delta_idx += 1 assigned_detection[det_idx] = True if compute_fp: for i in range(det_size): - if not (assigned_detection[i] or ignored_det[i] == -1 or ignored_det[i] == 1 or ignored_threshold[i]): + if not ( + assigned_detection[i] or ignored_det[i] == -1 or ignored_det[i] == 1 or ignored_obj_by_threshold[i] + ): fp += 1 nstuff = 0 - if metric == 0: - overlaps_dt_dc = image_box_overlap(dt_bboxes, dc_bboxes, 0) - for i in range(dc_bboxes.shape[0]): - for j in range(det_size): - if assigned_detection[j]: - continue - if ignored_det[j] == -1 or ignored_det[j] == 1: - continue - if ignored_threshold[j]: - continue - if overlaps_dt_dc[j, i] > min_overlap: - assigned_detection[j] = True - nstuff += 1 fp -= nstuff - if compute_aos: - tmp = np.zeros((fp + delta_idx,)) - # tmp = [0] * fp - for i in range(delta_idx): - tmp[i + fp] = (1.0 + np.cos(delta[i])) / 2.0 - # tmp.append((1.0 + np.cos(delta[i])) / 2.0) - # assert len(tmp) == fp + tp - # assert len(delta) == tp - if tp > 0 or fp > 0: - similarity = np.sum(tmp) - else: - similarity = -1 - return tp, fp, fn, similarity, thresholds[:thresh_idx] + + return tp, fp, fn, similarity, tp_scores[:thresh_idx] @numba.jit(nopython=True) @@ -364,8 +245,7 @@ 
def get_split_parts(num: int, num_part: int) -> list[int]: if remain_num == 0: return [same_part] * num_part - else: - return [same_part] * num_part + [remain_num] + return [same_part] * num_part + [remain_num] @numba.jit(nopython=True) @@ -374,53 +254,42 @@ def fused_compute_statistics( pr: np.ndarray, # shape: (num_thresholds, 4) gt_nums: np.ndarray, # shape: (num_samples) dt_nums: np.ndarray, # shape: (num_samples) - dc_nums: np.ndarray, # shape: (num_samples) gt_datas: np.ndarray, # shape: (total_gt_num, 7) dt_datas: np.ndarray, # shape: (total_dt_num, 7) - dontcares: np.ndarray, # shape: (total_dc_num, 4) ignored_gts: np.ndarray, # shape: (total_gt_num) ignored_dets: np.ndarray, # shape: (total_dt_num) - metric: int, min_overlap: float, thresholds: np.ndarray, # shape: (num_thresholds) - compute_aos: bool = False, ) -> None: """Fast compute statistics. Must be used in CAMERA coordinate system. Args: - overlaps: 2D array of shape (total_dt_num, total_gt_num) - [dt_num, gt_num] is the overlap between dt_num-th detection - and gt_num-th ground truth - pr: 2D array of shape (num_thresholds, 4) - [t, 0] is the number of true positives at threshold t - [t, 1] is the number of false positives at threshold t - [t, 2] is the number of false negatives at threshold t - [t, 3] is the similarity at threshold t - gt_nums: 1D array of shape (num_samples) - gt_nums[i] is the number of ground truths in i-th sample - dt_nums: 1D array of shape (num_samples) - dt_nums[i] is the number of detections in i-th sample - dc_nums: 1D array of shape (num_samples) - dc_nums[i] is the number of dontcare areas in i-th sample - gt_datas: 2D array of shape (total_gt_num, 7) - gt_datas[i] is the i-th ground truth box - dt_datas: 2D array of shape (total_dt_num, 7) - dt_datas[i] is the i-th detection box - dontcares: 2D array of shape (total_dc_num, 4) - dontcares[i] is the i-th dontcare area - ignored_gts: 1D array of shape (total_gt_num) - ignored_gts[i] is 1 if the i-th ground truth is ignored, 0 otherwise - ignored_dets: 1D array of shape (total_dt_num) - ignored_dets[i] is 1 if the i-th detection is ignored, 0 otherwise - metric: Eval type. 
0: bbox, 1: bev, 2: 3d - min_overlap: Min overlap - thresholds: 1D array of shape (num_thresholds) - thresholds[i] is the i-th threshold - compute_aos: Whether to compute aos + overlaps (np.ndarray): 2D array of shape (total_dt_num, total_gt_num), + [dt_num, gt_num] is the overlap between dt_num-th detection + and gt_num-th ground truth + pr (np.ndarray): 2D array of shape (num_thresholds, 4) + [t, 0] is the number of true positives at threshold t + [t, 1] is the number of false positives at threshold t + [t, 2] is the number of false negatives at threshold t + [t, 3] is the similarity at threshold t + gt_nums (np.ndarray): 1D array of shape (num_samples), + gt_nums[i] is the number of ground truths in i-th sample + dt_nums (np.ndarray): 1D array of shape (num_samples), + dt_nums[i] is the number of detections in i-th sample + gt_datas (np.ndarray): 2D array of shape (total_gt_num, 7), + gt_datas[i] is the i-th ground truth box + dt_datas (np.ndarray): 2D array of shape (total_dt_num, 7), + dt_datas[i] is the i-th detection box + ignored_gts (np.ndarray): 1D array of shape (total_gt_num), + ignored_gts[i] is 1 if the i-th ground truth is ignored, 0 otherwise + ignored_dets (np.ndarray): 1D array of shape (total_dt_num), + ignored_dets[i] is 1 if the i-th detection is ignored, 0 otherwise + min_overlap (float): Min overlap + thresholds (np.ndarray): 1D array of shape (num_thresholds), + thresholds[i] is the i-th threshold """ gt_num = 0 dt_num = 0 - dc_num = 0 for i in range(gt_nums.shape[0]): for t, thresh in enumerate(thresholds): overlap = overlaps[dt_num : dt_num + dt_nums[i], gt_num : gt_num + gt_nums[i]] @@ -428,19 +297,15 @@ def fused_compute_statistics( dt_data = dt_datas[dt_num : dt_num + dt_nums[i]] ignored_gt = ignored_gts[gt_num : gt_num + gt_nums[i]] ignored_det = ignored_dets[dt_num : dt_num + dt_nums[i]] - dontcare = dontcares[dc_num : dc_num + dc_nums[i]] tp, fp, fn, similarity, _ = compute_statistics_jit( overlap, gt_data, dt_data, ignored_gt, ignored_det, - dontcare, - metric, min_overlap=min_overlap, thresh=thresh, compute_fp=True, - compute_aos=compute_aos, ) pr[t, 0] += tp pr[t, 1] += fp @@ -449,22 +314,21 @@ def fused_compute_statistics( pr[t, 3] += similarity gt_num += gt_nums[i] dt_num += dt_nums[i] - dc_num += dc_nums[i] def calculate_iou_partly( gt_annos: list[dict[str, Any]], dt_annos: list[dict[str, Any]], - metric: int, num_parts: int = 50, ) -> tuple[list[np.ndarray], list[np.ndarray], np.ndarray, np.ndarray]: - """Fast iou algorithm. This function can be used independently to - do result analysis. Must be used in CAMERA coordinate system. + """Fast iou algorithm. + + This function can be used independently to do result analysis. + Must be used in CAMERA coordinate system. Args: gt_annos: List of dict, must from get_label_annos() in kitti_common.py dt_annos: List of dict, must from get_label_annos() in kitti_common.py - metric: Eval type. 0: bbox, 1: bev, 2: 3d num_parts: Int, a parameter for fast calculate algorithm Returns: @@ -475,12 +339,28 @@ def calculate_iou_partly( total_dt_num: Numpy array, shape (num_images,) """ - def d3_box_overlap(boxes, qboxes, criterion=-1): + def d3_box_overlap(boxes: np.ndarray, qboxes: np.ndarray, criterion: int = -1) -> np.ndarray: + """Calculate 3D box overlap. + + Args: + boxes (np.ndarray): Array of shape (n, 7) representing n 3D boxes. + qboxes (np.ndarray): Array of shape (k, 7) representing k 3D boxes. + criterion (int, optional): Overlap criterion. Defaults to -1. 
If -1, uses the + intersection-over-union (IoU) criterion. If 0, uses the + intersection-over-area1 criterion. If 1, uses the + intersection-over-area2 criterion. + + Returns: + np.ndarray: 1D array of shape (k, ) + """ rinc = rotate_iou_eval(boxes[:, [0, 2, 3, 5, 6]], qboxes[:, [0, 2, 3, 5, 6]], 2) d3_box_overlap_kernel(boxes, qboxes, rinc, criterion) return rinc - assert len(gt_annos) == len(dt_annos) + if len(gt_annos) != len(dt_annos): + msg = "gt_annos and dt_annos must have same length" + raise ValueError(msg) + total_dt_num = np.stack([len(a["name"]) for a in dt_annos], 0) total_gt_num = np.stack([len(a["name"]) for a in gt_annos], 0) num_examples = len(gt_annos) @@ -491,22 +371,17 @@ def d3_box_overlap(boxes, qboxes, criterion=-1): for num_part in split_parts: gt_annos_part = gt_annos[example_idx : example_idx + num_part] dt_annos_part = dt_annos[example_idx : example_idx + num_part] - if metric == 0: - gt_boxes = np.concatenate([a["bbox"] for a in gt_annos_part], 0) - dt_boxes = np.concatenate([a["bbox"] for a in dt_annos_part], 0) - overlap_part = image_box_overlap(gt_boxes, dt_boxes) - elif metric == 2: - loc = np.concatenate([a["location"] for a in gt_annos_part], 0) - dims = np.concatenate([a["dimensions"] for a in gt_annos_part], 0) - rots = np.concatenate([a["rotation_y"] for a in gt_annos_part], 0) - gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) - loc = np.concatenate([a["location"] for a in dt_annos_part], 0) - dims = np.concatenate([a["dimensions"] for a in dt_annos_part], 0) - rots = np.concatenate([a["rotation_y"] for a in dt_annos_part], 0) - dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) - overlap_part = d3_box_overlap(gt_boxes, dt_boxes).astype(np.float64) - else: - raise ValueError("unknown metric") + + loc = np.concatenate([a["location"] for a in gt_annos_part], 0) + dims = np.concatenate([a["dimensions"] for a in gt_annos_part], 0) + rots = np.concatenate([a["rotation_y"] for a in gt_annos_part], 0) + gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) + loc = np.concatenate([a["location"] for a in dt_annos_part], 0) + dims = np.concatenate([a["dimensions"] for a in dt_annos_part], 0) + rots = np.concatenate([a["rotation_y"] for a in dt_annos_part], 0) + dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) + overlap_part = d3_box_overlap(gt_boxes, dt_boxes).astype(np.float64) + parted_overlaps.append(overlap_part) example_idx += num_part overlaps = [] @@ -532,36 +407,30 @@ def _prepare_data( gt_annos: list[dict[str, Any]], dt_annos: list[dict[str, Any]], current_class: str, - difficulty: int, -) -> tuple[list[np.ndarray], list[np.ndarray], list[np.ndarray], list[np.ndarray], list[np.ndarray], np.ndarray, int]: +) -> tuple[list[np.ndarray], list[np.ndarray], list[np.ndarray], list[np.ndarray], int]: """Prepare data for evaluation. Args: gt_annos (List[Dict[str, Any]]): Ground truth annotations. dt_annos (List[Dict[str, Any]]): Detection annotations. current_class (str): Current class name. - difficulty (int): Difficulty level. 
Returns: - Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray], List[np.ndarray], List[np.ndarray], np.ndarray, int]: - gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares, total_dc_num, total_num_valid_gt + Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray], + List[np.ndarray], int]: + gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, + total_num_valid_gt """ gt_datas_list = [] dt_datas_list = [] - total_dc_num = [] - ignored_gts, ignored_dets, dontcares = [], [], [] + ignored_gts, ignored_dets = [], [] total_num_valid_gt = 0 for i in range(len(gt_annos)): - rets = clean_data(gt_annos[i], dt_annos[i], current_class, difficulty) - num_valid_gt, ignored_gt, ignored_det, dc_bboxes = rets + rets = clean_data(gt_annos[i], dt_annos[i], current_class) + num_valid_gt, ignored_gt, ignored_det = rets ignored_gts.append(np.array(ignored_gt, dtype=np.int64)) ignored_dets.append(np.array(ignored_det, dtype=np.int64)) - if len(dc_bboxes) == 0: - dc_bboxes = np.zeros((0, 4)).astype(np.float64) - else: - dc_bboxes = np.stack(dc_bboxes, 0).astype(np.float64) - total_dc_num.append(dc_bboxes.shape[0]) - dontcares.append(dc_bboxes) + total_num_valid_gt += num_valid_gt gt_datas = np.concatenate([gt_annos[i]["bbox"], gt_annos[i]["alpha"][..., np.newaxis]], 1) dt_datas = np.concatenate( @@ -574,185 +443,134 @@ def _prepare_data( ) gt_datas_list.append(gt_datas) dt_datas_list.append(dt_datas) - total_dc_num = np.stack(total_dc_num, axis=0) - return (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares, total_dc_num, total_num_valid_gt) + + return (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, total_num_valid_gt) def eval_class( gt_annos: list[dict[str, Any]], dt_annos: list[dict[str, Any]], current_classes: list[str], - difficultys: list[int], - metric: int, min_overlaps: np.ndarray, - compute_aos: bool = False, num_parts: int = 50, + num_samples_pts: int = 41, ) -> dict[str, np.ndarray]: """Kitti eval. support 2d/bev/3d/aos eval. support 0.5:0.05:0.95 coco AP. Args: - gt_annos: dict, must from get_label_annos() in kitti_common.py - dt_annos: dict, must from get_label_annos() in kitti_common.py - current_classes: list of label names - difficultys: list of int. eval difficulty, 0: easy, 1: normal, 2: hard - metric: eval type. 0: bbox, 1: bev, 2: 3d - min_overlaps: float, min overlap. format: [num_overlap, metric, class]. - num_parts: int. a parameter for fast calculate algorithm + gt_annos (dict): must from get_label_annos() in kitti_common.py + dt_annos (dict): must from get_label_annos() in kitti_common.py + current_classes (list): label names + min_overlaps (float): min overlap. format: [num_overlap, class]. 
+ num_parts (int): a parameter for fast calculate algorithm + num_samples_pts (int): number of points for precision-recall curve Returns: - dict of recall, precision and aos + dict of recall, precision """ - assert len(gt_annos) == len(dt_annos) num_examples = len(gt_annos) split_parts = get_split_parts(num_examples, num_parts) - rets = calculate_iou_partly(dt_annos, gt_annos, metric, num_parts) + rets = calculate_iou_partly(dt_annos, gt_annos, num_parts) overlaps, parted_overlaps, total_dt_num, total_gt_num = rets - N_SAMPLE_PTS = 41 num_minoverlap = len(min_overlaps) num_class = len(current_classes) - num_difficulty = len(difficultys) - precision = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) - recall = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) - aos = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) + precision = np.zeros([num_class, num_minoverlap, num_samples_pts]) + recall = np.zeros([num_class, num_minoverlap, num_samples_pts]) for m, current_class in enumerate(current_classes): - for l, difficulty in enumerate(difficultys): - rets = _prepare_data(gt_annos, dt_annos, current_class, difficulty) - ( - gt_datas_list, - dt_datas_list, - ignored_gts, - ignored_dets, - dontcares, - total_dc_num, - total_num_valid_gt, - ) = rets - for k, min_overlap in enumerate(min_overlaps[:, metric, m]): - thresholdss = [] - for i in range(len(gt_annos)): - rets = compute_statistics_jit( - overlaps[i], - gt_datas_list[i], - dt_datas_list[i], - ignored_gts[i], - ignored_dets[i], - dontcares[i], - metric, - min_overlap=min_overlap, - thresh=0.0, - compute_fp=False, - ) - tp, fp, fn, similarity, thresholds = rets - thresholdss += thresholds.tolist() - thresholdss = np.array(thresholdss) - thresholds = get_thresholds(thresholdss, total_num_valid_gt) - thresholds = np.array(thresholds) - pr = np.zeros([len(thresholds), 4]) - idx = 0 - for j, num_part in enumerate(split_parts): - gt_datas_part = np.concatenate(gt_datas_list[idx : idx + num_part], 0) - dt_datas_part = np.concatenate(dt_datas_list[idx : idx + num_part], 0) - dc_datas_part = np.concatenate(dontcares[idx : idx + num_part], 0) - ignored_dets_part = np.concatenate(ignored_dets[idx : idx + num_part], 0) - ignored_gts_part = np.concatenate(ignored_gts[idx : idx + num_part], 0) - fused_compute_statistics( - parted_overlaps[j], - pr, - total_gt_num[idx : idx + num_part], - total_dt_num[idx : idx + num_part], - total_dc_num[idx : idx + num_part], - gt_datas_part, - dt_datas_part, - dc_datas_part, - ignored_gts_part, - ignored_dets_part, - metric, - min_overlap=min_overlap, - thresholds=thresholds, - compute_aos=compute_aos, - ) - idx += num_part - for i in range(len(thresholds)): - recall[m, l, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 2]) - precision[m, l, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 1]) - if compute_aos: - aos[m, l, k, i] = pr[i, 3] / (pr[i, 0] + pr[i, 1]) - for i in range(len(thresholds)): - precision[m, l, k, i] = np.max(precision[m, l, k, i:], axis=-1) - recall[m, l, k, i] = np.max(recall[m, l, k, i:], axis=-1) - if compute_aos: - aos[m, l, k, i] = np.max(aos[m, l, k, i:], axis=-1) - ret_dict = { + ( + gt_datas_list, + dt_datas_list, + ignored_gts, + ignored_dets, + total_num_valid_gt, + ) = _prepare_data(gt_annos, dt_annos, current_class) + for k, min_overlap in enumerate(min_overlaps[:, m]): + thresholdss = [] + for i in range(len(gt_annos)): + tp, fp, fn, similarity, thresholds = compute_statistics_jit( + overlaps[i], + gt_datas_list[i], + dt_datas_list[i], + ignored_gts[i], 
+ ignored_dets[i], + min_overlap=min_overlap, + thresh=0.0, + compute_fp=False, + ) + thresholdss += thresholds.tolist() + if not thresholdss: + continue # no tp -> 0 precision and recall + # create thresholds between 0 and the max threshold, len(thresholds) == num_samples_pts + thresholds = np.linspace(0.0, np.max(thresholdss), num_samples_pts) + pr = np.zeros([len(thresholds), 4]) + idx = 0 + for j, num_part in enumerate(split_parts): + gt_datas_part = np.concatenate(gt_datas_list[idx : idx + num_part], 0) + dt_datas_part = np.concatenate(dt_datas_list[idx : idx + num_part], 0) + ignored_dets_part = np.concatenate(ignored_dets[idx : idx + num_part], 0) + ignored_gts_part = np.concatenate(ignored_gts[idx : idx + num_part], 0) + fused_compute_statistics( + parted_overlaps[j], + pr, + total_gt_num[idx : idx + num_part], + total_dt_num[idx : idx + num_part], + gt_datas_part, + dt_datas_part, + ignored_gts_part, + ignored_dets_part, + min_overlap=min_overlap, + thresholds=thresholds, + ) + idx += num_part + + for i in range(len(thresholds)): + recall[m, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 2]) + precision[m, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 1]) + + return { "recall": recall, "precision": precision, - "orientation": aos, } - return ret_dict - - -def print_str(value, *arg, sstream=None): - if sstream is None: - sstream = sysio.StringIO() - sstream.truncate(0) - sstream.seek(0) - print(value, *arg, file=sstream) - return sstream.getvalue() def do_eval_cut_version( - gt_annos: list[dict[str, Any]], # type hint - dt_annos: list[dict[str, Any]], # type hint - current_classes: list[str], # type hint - min_overlaps: np.ndarray, # type hint - compute_aos: bool = False, # type hint -) -> tuple[float, float]: # type hint + gt_annos: list[dict[str, Any]], + dt_annos: list[dict[str, Any]], + current_classes: list[str], + min_overlaps: np.ndarray, +) -> np.ndarray: """Evaluates detections with COCO style AP. Args: - gt_annos (List[dict]): Ground truth annotations. - dt_annos (List[dict]): Detection results. - current_classes (List[str]): Classes to evaluate. + gt_annos (list[dict]): Ground truth annotations. + dt_annos (list[dict]): Detection results. + current_classes (list[str]): Classes to evaluate. min_overlaps (np.ndarray): Overlap ranges. - compute_aos (bool): Whether to compute aos. Returns: - Tuple[float, float]: Bounding box and 3D bounding box AP. + np.ndarray: 3D bounding box AP. """ - - def _get_mAP(prec: np.ndarray) -> np.ndarray: - sums = 0 - for i in range(0, prec.shape[-1], 4): - sums = sums + prec[..., i] - return sums / 11 * 100 - - # min_overlaps: [num_minoverlap, metric, num_class] - difficultys = [0, 1, 2] - ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 0, min_overlaps, compute_aos) - # ret: [num_class, num_diff, num_minoverlap, num_sample_points] - # get 2D bbox mAP - mAP_bbox = _get_mAP(ret["precision"]) - + # min_overlaps: [num_minoverlap, num_class] # get 3D bbox mAP - ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 2, min_overlaps) - mAP_3d = _get_mAP(ret["precision"]) - - return mAP_bbox, mAP_3d + ret = eval_class(gt_annos, dt_annos, current_classes, min_overlaps) + return np.mean(ret["precision"], axis=2) def get_coco_eval_result( gt_annos: list[dict], dt_annos: list[dict], current_classes: list[str], -) -> tuple[np.ndarray, np.ndarray]: +) -> np.ndarray: """Evaluates detections with COCO style AP. Args: - gt_annos (List[dict]): Ground truth annotations. - dt_annos (List[dict]): Detection results. 
- current_classes (List[str]): Classes to evaluate. + gt_annos (list[dict]): Ground truth annotations. + dt_annos (list[dict]): Detection results. + current_classes (list[str]): Classes to evaluate. Returns: - Tuple[np.ndarray, np.ndarray]: Bounding box and 3D bounding box AP. + np.ndarray: 3D bounding box AP. """ def do_coco_style_eval( @@ -760,52 +578,43 @@ def do_coco_style_eval( dt_annos: list[dict], current_classes: list[str], overlap_ranges: np.ndarray, - compute_aos: bool, - ) -> tuple[np.ndarray, np.ndarray]: + ) -> np.ndarray: """Evaluates detections with COCO style AP. Args: - gt_annos (List[dict]): Ground truth annotations. - dt_annos (List[dict]): Detection results. - current_classes (List[str]): Classes to evaluate. + gt_annos (list[dict]): Ground truth annotations. + dt_annos (list[dict]): Detection results. + current_classes (list[str]): Classes to evaluate. overlap_ranges (np.ndarray): Overlap ranges. - compute_aos (bool): Whether to compute aos. Returns: - Tuple[np.ndarray, np.ndarray]: Bounding box and 3D bounding box AP. + np.ndarray: 3D bounding box AP. """ min_overlaps = np.zeros([10, *overlap_ranges.shape[1:]]) for i in range(overlap_ranges.shape[1]): - for j in range(overlap_ranges.shape[2]): - min_overlaps[:, i, j] = np.linspace(*overlap_ranges[:, i, j][:2], 10) + min_overlaps[:, i] = np.linspace(*overlap_ranges[:, i], 10) - mAP_bbox, mAP_3d = do_eval_cut_version(gt_annos, dt_annos, current_classes, min_overlaps, compute_aos) + map_3d = do_eval_cut_version(gt_annos, dt_annos, current_classes, min_overlaps) - return mAP_bbox.mean(-1), mAP_3d.mean(-1) + result_str = "" - iou_range = [0.5, 0.95, 10] + for i, lbl in enumerate(current_classes): + result_str += f"\nclass: {lbl}\n" + "-" * len(f"class: {lbl}") + "\n" + for j, overlap in enumerate(min_overlaps): + result_str += f"AP@IoU={np.round(overlap[i],2)}: {np.round(map_3d[i][j] * 100, 2)}\n" + result_str += "\n" + logging.log(msg=result_str, level=logging.INFO) + + return map_3d.mean(0) + + iou_range = [0.5, 0.95] if not isinstance(current_classes, (list, tuple)): current_classes = [current_classes] - overlap_ranges = np.zeros([3, 3, len(current_classes)]) - for i, curcls in enumerate(current_classes): - # IoU from 0.5 to 0.95 - overlap_ranges[:, :, i] = np.array(iou_range)[:, np.newaxis] - result = "" - # check whether alpha is valid - compute_aos = False - mAPbbox, mAP3d = do_coco_style_eval(gt_annos, dt_annos, current_classes, overlap_ranges, compute_aos) - - for j, curcls in enumerate(current_classes): - # mAP threshold array: [num_minoverlap, metric, class] - # mAP result: [num_class, num_diff, num_minoverlap] - o_range = np.array(iou_range)[[0, 2, 1]] - o_range[1] = (o_range[2] - o_range[0]) / (o_range[1] - 1) - result += print_str(f"{curcls} " "coco AP@{:.2f}:{:.2f}:{:.2f}:".format(*o_range)) - result += print_str(f"bbox AP:{mAPbbox[j, 0]:.2f}, {mAPbbox[j, 1]:.2f}, {mAPbbox[j, 2]:.2f}") - result += print_str(f"3d AP:{mAP3d[j, 0]:.2f}, {mAP3d[j, 1]:.2f}, {mAP3d[j, 2]:.2f}") - - print("\n COCO style evaluation results: \n", result) - - return mAPbbox, mAP3d + overlap_ranges = np.zeros([2, len(current_classes)]) + for i in range(len(current_classes)): + # iou from 0.5 to 0.95 + overlap_ranges[:, i] = np.array(iou_range) + + return do_coco_style_eval(gt_annos, dt_annos, current_classes, overlap_ranges) diff --git a/src/otx/core/model/base.py b/src/otx/core/model/base.py index 4ed1aa61ddc..a48325ca98c 100644 --- a/src/otx/core/model/base.py +++ b/src/otx/core/model/base.py @@ -124,7 +124,6 @@ def __init__( 
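Editorial aside on the simplified KITTI eval above: with the metric and difficulty axes removed, min_overlaps is indexed as [num_minoverlap, num_class] and the reported score is a plain mean over sampled PR points and IoU thresholds. A compact sketch of that flow with made-up class names and a stand-in precision array (not the library's actual output):

    import numpy as np

    classes = ["Car", "Pedestrian"]               # hypothetical label names
    overlap_ranges = np.zeros([2, len(classes)])
    for i in range(len(classes)):
        overlap_ranges[:, i] = np.array([0.5, 0.95])

    # ten IoU thresholds per class: 0.50, 0.55, ..., 0.95
    min_overlaps = np.zeros([10, len(classes)])
    for i in range(overlap_ranges.shape[1]):
        min_overlaps[:, i] = np.linspace(*overlap_ranges[:, i], 10)

    # eval_class returns precision shaped [num_class, num_minoverlap, num_samples_pts];
    # averaging over the PR samples gives per-class, per-threshold AP, and averaging
    # over the class axis yields the per-IoU AP that do_coco_style_eval returns.
    precision = np.random.rand(len(classes), 10, 41)   # stand-in for eval_class output
    ap_per_class_per_iou = precision.mean(axis=2)
    ap_per_iou = ap_per_class_per_iou.mean(axis=0)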
self.input_size = input_size self.classification_layers: dict[str, dict[str, Any]] = {} self.model = self._create_model() - self._explain_mode = False self.optimizer_callable = ensure_callable(optimizer) self.scheduler_callable = ensure_callable(scheduler) self.metric_callable = ensure_callable(metric) @@ -1097,11 +1096,6 @@ def model_adapter_parameters(self) -> dict: def _set_label_info(self, label_info: LabelInfoTypes) -> None: """Set this model label information.""" new_label_info = self._dispatch_label_info(label_info) - - if self._label_info != new_label_info: - msg = "OVModel strictly does not allow overwrite label_info if they are different each other." - raise ValueError(msg) - self._label_info = new_label_info def _create_label_info_from_ov_ir(self) -> LabelInfo: diff --git a/src/otx/core/model/detection.py b/src/otx/core/model/detection.py index fb6acccf12d..023b5268388 100644 --- a/src/otx/core/model/detection.py +++ b/src/otx/core/model/detection.py @@ -287,7 +287,7 @@ def _export_parameters(self) -> TaskLevelExportParameters: return super()._export_parameters.wrap( model_type="ssd", task_type="detection", - confidence_threshold=self.hparams.get("best_confidence_threshold", None), + confidence_threshold=max(0.35, self.hparams.get("best_confidence_threshold", 0.35)), iou_threshold=0.5, tile_config=self.tile_config if self.tile_config.enable_tiler else None, ) diff --git a/src/otx/core/model/detection_3d.py b/src/otx/core/model/detection_3d.py index caa0d14090f..0e73c73bcfa 100644 --- a/src/otx/core/model/detection_3d.py +++ b/src/otx/core/model/detection_3d.py @@ -5,23 +5,25 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any, NamedTuple import numpy as np import torch +from model_api.models import ImageModel from torchvision.ops import box_convert +from otx.algo.object_detection_3d.utils.utils import box_cxcylrtb_to_xyxy from otx.algo.utils.mmengine_utils import load_checkpoint -from otx.core.data.dataset.utils.kitti_utils import class2angle -from otx.core.data.entity.base import ImageInfo +from otx.core.data.entity.base import ImageInfo, OTXBatchLossEntity from otx.core.data.entity.object_detection_3d import Det3DBatchDataEntity, Det3DBatchPredEntity from otx.core.metrics import MetricInput from otx.core.metrics.average_precision_3d import KittiMetric -from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable, OTXModel +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable, OTXModel, OVModel from otx.core.types.export import TaskLevelExportParameters if TYPE_CHECKING: from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable + from model_api.adapters.inference_adapter import InferenceAdapter from torch import nn from otx.core.metrics import MetricCallable @@ -73,77 +75,86 @@ def _create_model(self) -> nn.Module: def _export_parameters(self) -> TaskLevelExportParameters: """Defines parameters required to export a particular model implementation.""" return super()._export_parameters.wrap( - model_type="ssd", - task_type="detection", + model_type="mono_3d_det", + task_type="3d_detection", ) - def _convert_pred_entity_to_compute_metric( + def _customize_inputs( self, - preds: Det3DBatchPredEntity, - inputs: Det3DBatchDataEntity, - ) -> MetricInput: - """Converts the prediction entity to the format required for computing metrics. - - Args: - preds (Det3DBatchPredEntity): Prediction entity. - inputs (Det3DBatchDataEntity): Input data entity. 
- """ - boxes = preds.boxes_3d - # bbox 2d decoding - xywh_2d = box_convert(preds.boxes, "xyxy", "cxcywh") - - xs3d = boxes[:, :, 0:1] - ys3d = boxes[:, :, 1:2] - xs2d = xywh_2d[:, :, 0:1] - ys2d = xywh_2d[:, :, 1:2] - - batch = len(boxes) - labels = preds.labels.view(batch, -1, 1) - scores = preds.scores.view(batch, -1, 1) - xs2d = xs2d.view(batch, -1, 1) - ys2d = ys2d.view(batch, -1, 1) - xs3d = xs3d.view(batch, -1, 1) - ys3d = ys3d.view(batch, -1, 1) - - detections = ( - torch.cat( - [ - labels, - scores, - xs2d, - ys2d, - preds.size_2d, - preds.depth[:, :, 0:1], - preds.heading_angle, - preds.size_3d, - xs3d, - ys3d, - torch.exp(-preds.depth[:, :, 1:2]), - ], - dim=2, - ) - .detach() - .cpu() - .numpy() - ) - - img_sizes = np.array([img_info.ori_shape for img_info in inputs.imgs_info]) - calib_matrix = [p2.detach().cpu().numpy() for p2 in inputs.calib_matrix] - result_list = self._decode_detections_for_kitti_format( - detections, - img_sizes, - calib_matrix, - class_names=self.label_info.label_names, - threshold=self.score_threshold, + entity: Det3DBatchDataEntity, + ) -> dict[str, Any]: + # prepare bboxes for the model + targets_list = [] + img_sizes = torch.from_numpy(np.array([img_info.ori_shape for img_info in entity.imgs_info])).to( + device=entity.images.device, ) + key_list = ["labels", "boxes", "depth", "size_3d", "heading_angle", "boxes_3d"] + for bz in range(len(entity.imgs_info)): + target_dict = {} + for key in key_list: + target_dict[key] = getattr(entity, key)[bz] + targets_list.append(target_dict) return { - "preds": result_list, - "target": inputs.original_kitti_format, # type: ignore[dict-item] + "images": entity.images, + "calibs": torch.cat([p2.unsqueeze(0) for p2 in entity.calib_matrix], dim=0), + "targets": targets_list, + "img_sizes": img_sizes, + "mode": "loss" if self.training else "predict", } + def _customize_outputs( + self, + outputs: dict[str, torch.Tensor], + inputs: Det3DBatchDataEntity, + ) -> Det3DBatchPredEntity | OTXBatchLossEntity: + if self.training: + if not isinstance(outputs, dict): + raise TypeError(outputs) + + losses = OTXBatchLossEntity() + for k, v in outputs.items(): + if isinstance(v, list): + losses[k] = sum(v) + elif isinstance(v, torch.Tensor): + losses[k] = v + else: + msg = "Loss output should be list or torch.tensor but got {type(v)}" + raise TypeError(msg) + return losses + + labels, scores, size_3d, heading_angle, boxes_3d, depth = self.extract_dets_from_outputs(outputs) + # bbox 2d decoding + boxes_2d = box_cxcylrtb_to_xyxy(boxes_3d) + xywh_2d = box_convert(boxes_2d, "xyxy", "cxcywh") + # size 2d decoding + size_2d = xywh_2d[:, :, 2:4] + + return Det3DBatchPredEntity( + batch_size=inputs.batch_size, + images=inputs.images, + imgs_info=inputs.imgs_info, + calib_matrix=inputs.calib_matrix, + boxes=boxes_2d, + labels=labels, + boxes_3d=boxes_3d, + size_2d=size_2d, + size_3d=size_3d, + depth=depth, + heading_angle=heading_angle, + scores=scores, + original_kitti_format=[None], + ) + + def _convert_pred_entity_to_compute_metric( + self, + preds: Det3DBatchPredEntity, + inputs: Det3DBatchDataEntity, + ) -> MetricInput: + return _convert_pred_entity_to_compute_metric(preds, inputs, self.label_info.label_names, self.score_threshold) + @staticmethod - def _decode_detections_for_kitti_format( + def decode_detections_for_kitti_format( dets: np.ndarray, img_size: np.ndarray, calib_matrix: list[np.ndarray], @@ -153,7 +164,34 @@ def _decode_detections_for_kitti_format( """Decode the detection results for KITTI format.""" def 
_get_heading_angle(heading: np.ndarray) -> np.ndarray: - """Get heading angle from the prediction.""" + """Get heading angle from the prediction. + + Args: + heading (np.ndarray): The heading prediction. + + Returns: + np.ndarray: The heading angle in label format. + """ + + def class2angle(cls: int, residual: float, to_label_format: bool = False) -> float: + """Inverse function to angle2class. + + Args: + cls (int): The class index. + residual (float): The residual angle. + to_label_format (bool): Whether to return the angle in label format. + + Returns: + float: The angle in label format. + """ + num_heading_bin = 12 + angle_per_class = 2 * np.pi / float(num_heading_bin) + angle_center = cls * angle_per_class + angle = angle_center + residual + if to_label_format and angle > np.pi: + angle = angle - 2 * np.pi + return angle + heading_bin, heading_res = heading[0:12], heading[12:24] cls = np.argmax(heading_bin) res = heading_res[cls] @@ -203,10 +241,10 @@ def _img_to_rect(calib_matrix: np.ndarray, u: np.ndarray, v: np.ndarray, depth_r continue # 2d bboxs decoding - x = dets[i, j, 2] * img_size[i][0] - y = dets[i, j, 3] * img_size[i][1] - w = dets[i, j, 4] * img_size[i][0] - h = dets[i, j, 5] * img_size[i][1] + x = dets[i, j, 2] * img_size[i][1] + y = dets[i, j, 3] * img_size[i][0] + w = dets[i, j, 4] * img_size[i][1] + h = dets[i, j, 5] * img_size[i][0] bbox = [x - w / 2, y - h / 2, x + w / 2, y + h / 2] # 3d bboxs decoding @@ -217,8 +255,8 @@ def _img_to_rect(calib_matrix: np.ndarray, u: np.ndarray, v: np.ndarray, depth_r dimension = dets[i, j, 31:34] # positions decoding - x3d = dets[i, j, 34] * img_size[i][0] - y3d = dets[i, j, 35] * img_size[i][1] + x3d = dets[i, j, 34] * img_size[i][1] + y3d = dets[i, j, 35] * img_size[i][0] location = _img_to_rect(calib_matrix[i], x3d, y3d, depth).reshape(-1) location[1] += dimension[0] / 2 @@ -255,31 +293,7 @@ def get_dummy_input(self, batch_size: int = 1) -> Det3DBatchDataEntity: msg = f"Input size attribute is not set for {self.__class__}" raise ValueError(msg) - images = [torch.rand(3, *self.input_size) for _ in range(batch_size)] - calib_matrix = [torch.rand(3, 4) for _ in range(batch_size)] - infos = [] - for i, img in enumerate(images): - infos.append( - ImageInfo( - img_idx=i, - img_shape=img.shape, - ori_shape=img.shape, - ), - ) - return Det3DBatchDataEntity( - batch_size, - images, - infos, - boxes=[], - labels=[], - calib_matrix=calib_matrix, - boxes_3d=[], - size_2d=[], - size_3d=[], - depth=[], - heading_angle=[], - original_kitti_format=[], - ) + return _generate_dummy_input(self.input_size, batch_size) def get_classification_layers(self, prefix: str = "model.") -> dict[str, dict[str, int]]: """Get final classification layer information for incremental learning case.""" @@ -295,3 +309,347 @@ def get_classification_layers(self, prefix: str = "model.") -> dict[str, dict[st num_extra_classes = 6 * sample_model_dim - 5 * incremental_model_dim classification_layers[prefix + key] = {"stride": stride, "num_extra_classes": num_extra_classes} return classification_layers + + +class MonoDETRModel(ImageModel): + """A wrapper for MonoDETR 3d object detection model.""" + + __model__ = "mono_3d_det" + + def __init__(self, inference_adapter: InferenceAdapter, configuration: dict[str, Any], preload: bool = False): + """Initializes a 3d detection model. + + Args: + inference_adapter (InferenceAdapter): inference adapter containing the underlying model. 
+ configuration (dict, optional): configuration overrides the model parameters (see parameters() method). + preload (bool, optional): forces inference adapter to load the model. Defaults to False. + """ + super().__init__(inference_adapter, configuration, preload) + self._check_io_number(3, 5) + + def preprocess(self, inputs: dict[str, np.ndarray]) -> tuple[dict[str, Any], ...]: + """Preprocesses the input data for the model. + + Args: + inputs (dict[str, np.ndarray]): a dict with image, calibration matrix, and image size + + Returns: + tuple[dict[str, Any], ...]: a tuple with the preprocessed inputs and meta information + """ + return { + self.image_blob_name: inputs["image"][None], + "calib_matrix": inputs["calib"], + "img_sizes": inputs["img_size"][None], + }, { + "original_shape": inputs["image"].shape, + "resized_shape": (self.h, self.w, self.c), + } + + def _get_inputs(self) -> tuple[list[Any], list[Any]]: + """Defines the model inputs for images and additional info. + + Raises: + WrapperError: if the wrapper failed to define appropriate inputs for images + + Returns: + - list of inputs names for images + - list of inputs names for additional info + """ + image_blob_names, image_info_blob_names = [], [] + for name, metadata in self.inputs.items(): + if len(metadata.shape) == 4: + image_blob_names.append(name) + elif len(metadata.shape) == 2: + image_info_blob_names.append(name) + + if not image_blob_names: + self.raise_error( + "Failed to identify the input for the image: no 4D input layer found", + ) + return image_blob_names, image_info_blob_names + + def postprocess( + self, + outputs: dict[str, np.ndarray], + meta: dict[str, Any], + ) -> dict[str, Any]: + """Applies SCC decoded to the model outputs. + + Args: + outputs (dict[str, np.ndarray]): raw outputs of the model + meta (dict[str, Any]): meta information about the input data + + Returns: + dict[str, Any]: postprocessed model outputs + """ + result = {} + for k in outputs: + result[k] = np.copy(outputs[k]) + + return result + + +class OV3DDetectionModel(OVModel[Det3DBatchDataEntity, Det3DBatchPredEntity]): + """3d detection model compatible for OpenVINO IR inference. + + It can consume OpenVINO IR model path or model name from Intel OMZ repository + and create the OTX 3d detection model compatible for OTX testing pipeline. 
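For reference, a minimal numeric sketch of the 12-bin heading decode used by decode_detections_for_kitti_format above; the prediction vector here is synthetic, only the bin arithmetic is taken from the code:

    import numpy as np

    num_heading_bin = 12
    angle_per_class = 2 * np.pi / num_heading_bin

    heading = np.random.rand(24)        # synthetic prediction: 12 bin scores + 12 residuals
    cls = int(np.argmax(heading[:12]))  # winning bin
    res = float(heading[12 + cls])      # residual of that bin
    angle = cls * angle_per_class + res
    if angle > np.pi:                   # wrap into (-pi, pi] to match the KITTI label format
        angle -= 2 * np.pi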
+ """ + + def __init__( + self, + model_name: str, + model_type: str = "mono_3d_det", + async_inference: bool = True, + max_num_requests: int | None = None, + use_throughput_mode: bool = True, + model_api_configuration: dict[str, Any] | None = None, + metric: MetricCallable = KittiMetric, + score_threshold: float = 0.2, + **kwargs, + ) -> None: + super().__init__( + model_name=model_name, + model_type=model_type, + async_inference=async_inference, + max_num_requests=max_num_requests, + use_throughput_mode=use_throughput_mode, + model_api_configuration=model_api_configuration, + metric=metric, + ) + self.score_threshold = score_threshold + + def _customize_inputs( + self, + entity: Det3DBatchDataEntity, + ) -> dict[str, Any]: + img_sizes = np.array([img_info.ori_shape for img_info in entity.imgs_info]) + images = [np.transpose(im.cpu().numpy(), (1, 2, 0)) for im in entity.images] + + return { + "images": images, + "calibs": [p2.unsqueeze(0).cpu().numpy() for p2 in entity.calib_matrix], + "targets": [], + "img_sizes": img_sizes, + "mode": "predict", + } + + def _customize_outputs( + self, + outputs: list[NamedTuple], + inputs: Det3DBatchDataEntity, + ) -> Det3DBatchPredEntity | OTXBatchLossEntity: + stacked_outputs: dict[str, Any] = {} + + for output in outputs: + for k in output: + if k in stacked_outputs: + stacked_outputs[k] = torch.cat((stacked_outputs[k], torch.tensor(output[k])), 0) + else: + stacked_outputs[k] = torch.tensor(output[k]) + + labels, scores, size_3d, heading_angle, boxes_3d, depth = self.extract_dets_from_outputs(stacked_outputs) + # bbox 2d decoding + boxes_2d = box_cxcylrtb_to_xyxy(boxes_3d) + xywh_2d = box_convert(boxes_2d, "xyxy", "cxcywh") + # size 2d decoding + size_2d = xywh_2d[:, :, 2:4] + + return Det3DBatchPredEntity( + batch_size=len(outputs), + images=inputs.images, + imgs_info=inputs.imgs_info, + calib_matrix=inputs.calib_matrix, + boxes=boxes_2d, + labels=labels, + boxes_3d=boxes_3d, + size_2d=size_2d, + size_3d=size_3d, + depth=depth, + heading_angle=heading_angle, + scores=scores, + original_kitti_format=[None], + ) + + def _forward(self, inputs: Det3DBatchDataEntity) -> Det3DBatchPredEntity: + """Model forward function.""" + all_inputs = self._customize_inputs(inputs) + + model_ready_inputs = [] + for image, calib, img_size in zip(all_inputs["images"], all_inputs["calibs"], all_inputs["img_sizes"]): + model_ready_inputs.append( + { + "image": image, + "calib": calib, + "img_size": img_size, + }, + ) + + if self.async_inference: + outputs = self.model.infer_batch(model_ready_inputs) + else: + outputs = [] + for model_input in model_ready_inputs: + outputs.append(self.model(model_input)) + + customized_outputs = self._customize_outputs(outputs, inputs) + + if isinstance(customized_outputs, OTXBatchLossEntity): + raise TypeError(customized_outputs) + + return customized_outputs + + def transform_fn(self, data_batch: Det3DBatchDataEntity) -> dict: + """Data transform function for PTQ.""" + all_inputs = self._customize_inputs(data_batch) + image = all_inputs["images"][0] + model = self.model + resized_image = model.resize(image, (model.w, model.h)) + resized_image = model.input_transform(resized_image) + + return { + "images": model._change_layout(resized_image), # noqa: SLF001, + "calib_matrix": all_inputs["calibs"][0], + "img_sizes": all_inputs["img_sizes"][0][None], + } + + @staticmethod + def extract_dets_from_outputs(outputs: dict[str, torch.Tensor], topk: int = 50) -> tuple[torch.Tensor, ...]: + """Extract detection results from model outputs.""" + # b, 
q, c + out_logits = outputs["scores"] + out_bbox = outputs["boxes_3d"] + + prob = out_logits + topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), topk, dim=1) + + # final scores + scores = topk_values + # final indexes + topk_boxes = (topk_indexes // out_logits.shape[2]).unsqueeze(-1) + # final labels + labels = topk_indexes % out_logits.shape[2] + + heading = outputs["heading_angle"] + size_3d = outputs["size_3d"] + depth = outputs["depth"] + # decode boxes + boxes_3d = torch.gather(out_bbox, 1, topk_boxes.repeat(1, 1, 6)) # b, q', 4 + # heading angle decoding + heading = torch.gather(heading, 1, topk_boxes.repeat(1, 1, 24)) + # depth decoding + depth = torch.gather(depth, 1, topk_boxes.repeat(1, 1, 2)) + # 3d dims decoding + size_3d = torch.gather(size_3d, 1, topk_boxes.repeat(1, 1, 3)) + + return labels, scores, size_3d, heading, boxes_3d, depth + + def _convert_pred_entity_to_compute_metric( + self, + preds: Det3DBatchPredEntity, + inputs: Det3DBatchDataEntity, + ) -> MetricInput: + return _convert_pred_entity_to_compute_metric(preds, inputs, self.label_info.label_names, self.score_threshold) + + def get_dummy_input(self, batch_size: int = 1) -> Det3DBatchDataEntity: + """Returns a dummy input for 3d object detection model.""" + return _generate_dummy_input((224, 224), batch_size) + + +def _convert_pred_entity_to_compute_metric( + preds: Det3DBatchPredEntity, + inputs: Det3DBatchDataEntity, + label_names: list[str], + score_threshold: float, +) -> MetricInput: + """Converts the prediction entity to the format required for computing metrics. + + Args: + preds (Det3DBatchPredEntity): Prediction entity. + inputs (Det3DBatchDataEntity): Input data entity. + label_names (list[str]): List of label names. + score_threshold (float): Score threshold for filtering the predictions. 
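A small shape sketch of the top-k decode in extract_dets_from_outputs above; all sizes are illustrative, not the model's real dimensions:

    import torch

    b, q, c = 2, 100, 3           # batch, queries, classes (hypothetical sizes)
    scores = torch.rand(b, q, c)  # per-query class probabilities
    boxes_3d = torch.rand(b, q, 6)

    topk = 5
    topk_values, topk_indexes = torch.topk(scores.view(b, -1), topk, dim=1)
    topk_boxes = (topk_indexes // c).unsqueeze(-1)  # query index of each selected detection
    labels = topk_indexes % c                       # class index of each selected detection
    # gather the 6-dim 3D box of every selected query -> shape (b, topk, 6)
    selected = torch.gather(boxes_3d, 1, topk_boxes.repeat(1, 1, 6))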
+ """ + boxes = preds.boxes_3d + # bbox 2d decoding + xywh_2d = box_convert(preds.boxes, "xyxy", "cxcywh") + + xs3d = boxes[:, :, 0:1] + ys3d = boxes[:, :, 1:2] + xs2d = xywh_2d[:, :, 0:1] + ys2d = xywh_2d[:, :, 1:2] + + batch = len(boxes) + labels = preds.labels.view(batch, -1, 1) + scores = preds.scores.view(batch, -1, 1) + xs2d = xs2d.view(batch, -1, 1) + ys2d = ys2d.view(batch, -1, 1) + xs3d = xs3d.view(batch, -1, 1) + ys3d = ys3d.view(batch, -1, 1) + + detections = ( + torch.cat( + [ + labels, + scores, + xs2d, + ys2d, + preds.size_2d, + preds.depth[:, :, 0:1], + preds.heading_angle, + preds.size_3d, + xs3d, + ys3d, + torch.exp(-preds.depth[:, :, 1:2]), + ], + dim=2, + ) + .detach() + .cpu() + .numpy() + ) + + img_sizes = np.array([img_info.ori_shape for img_info in inputs.imgs_info]) + calib_matrix = [p2.detach().cpu().numpy() for p2 in inputs.calib_matrix] + result_list = OTX3DDetectionModel.decode_detections_for_kitti_format( + detections, + img_sizes, + calib_matrix, + class_names=label_names, + threshold=score_threshold, + ) + + return { + "preds": result_list, + "target": inputs.original_kitti_format, # type: ignore[dict-item] + } + + +def _generate_dummy_input(input_size: tuple[int, ...], batch_size: int = 1) -> Det3DBatchDataEntity: + """Returns a dummy input for 3d object detection model.""" + images = torch.rand(batch_size, 3, *input_size) + calib_matrix = [torch.rand(3, 4) for _ in range(batch_size)] + infos = [] + for i, img in enumerate(images): + infos.append( + ImageInfo( + img_idx=i, + img_shape=img.shape[1:], + ori_shape=img.shape[1:], + ), + ) + + return Det3DBatchDataEntity( + batch_size, + images, + infos, + boxes=[torch.Tensor(0)] * batch_size, + labels=[torch.LongTensor(0)] * batch_size, + calib_matrix=calib_matrix, + boxes_3d=[torch.LongTensor(0)] * batch_size, + size_2d=[], + size_3d=[torch.LongTensor(0)] * batch_size, + depth=[torch.LongTensor(0)] * batch_size, + heading_angle=[torch.LongTensor(0)] * batch_size, + original_kitti_format=[], + ) diff --git a/src/otx/core/model/segmentation.py b/src/otx/core/model/segmentation.py index 9595b218231..a22cc15fbc4 100644 --- a/src/otx/core/model/segmentation.py +++ b/src/otx/core/model/segmentation.py @@ -5,6 +5,7 @@ from __future__ import annotations +import copy import json import logging as log from abc import abstractmethod @@ -126,7 +127,12 @@ def _build_model(self) -> nn.Module: """ def _customize_inputs(self, entity: SegBatchDataEntity) -> dict[str, Any]: - mode = "loss" if self.training else "predict" + if self.training: + mode = "loss" + elif self.explain_mode: + mode = "explain" + else: + mode = "predict" if self.train_type == OTXTrainType.SEMI_SUPERVISED and mode == "loss": if not isinstance(entity, dict): @@ -162,6 +168,16 @@ def _customize_outputs( losses[k] = v return losses + if self.explain_mode: + return SegBatchPredEntity( + batch_size=len(outputs["preds"]), + images=inputs.images, + imgs_info=inputs.imgs_info, + scores=[], + masks=outputs["preds"], + feature_vector=outputs["feature_vector"], + ) + return SegBatchPredEntity( batch_size=len(outputs), images=inputs.images, @@ -173,12 +189,20 @@ def _customize_outputs( @property def _export_parameters(self) -> TaskLevelExportParameters: """Defines parameters required to export a particular model implementation.""" + if self.label_info.label_names[0] == "otx_background_lbl": + # remove otx background label for export + modified_label_info = copy.deepcopy(self.label_info) + modified_label_info.label_names.pop(0) + else: + modified_label_info = 
self.label_info + return super()._export_parameters.wrap( model_type="Segmentation", task_type="segmentation", return_soft_prediction=True, soft_threshold=0.5, blur_strength=-1, + label_info=modified_label_info, tile_config=self.tile_config if self.tile_config.enable_tiler else None, ) @@ -199,7 +223,7 @@ def _exporter(self) -> OTXModelExporter: swap_rgb=False, via_onnx=False, onnx_export_configuration=None, - output_names=None, + output_names=["preds", "feature_vector"] if self.explain_mode else None, ) def _convert_pred_entity_to_compute_metric( @@ -207,6 +231,16 @@ def _convert_pred_entity_to_compute_metric( preds: SegBatchPredEntity, inputs: SegBatchDataEntity, ) -> MetricInput: + """Convert prediction and input entities to a format suitable for metric computation. + + Args: + preds (SegBatchPredEntity): The predicted segmentation batch entity containing predicted masks. + inputs (SegBatchDataEntity): The input segmentation batch entity containing ground truth masks. + + Returns: + MetricInput: A list of dictionaries where each dictionary contains 'preds' and 'target' keys + corresponding to the predicted and target masks for metric evaluation. + """ return [ { "preds": pred_mask, @@ -280,8 +314,26 @@ def forward_tiles(self, inputs: OTXTileBatchDataEntity[SegBatchDataEntity]) -> S def forward_for_tracing(self, image: Tensor) -> Tensor | dict[str, Tensor]: """Model forward function used for the model tracing during model exportation.""" - raw_outputs = self.model(inputs=image, mode="tensor") - return torch.softmax(raw_outputs, dim=1) + if self.explain_mode: + outputs = self.model(inputs=image, mode="explain") + outputs["preds"] = torch.softmax(outputs["preds"], dim=1) + return outputs + + outputs = self.model(inputs=image, mode="tensor") + return torch.softmax(outputs, dim=1) + + def forward_explain(self, inputs: SegBatchDataEntity) -> SegBatchPredEntity: + """Model forward explain function.""" + outputs = self.model(inputs=inputs.images, mode="explain") + + return SegBatchPredEntity( + batch_size=len(outputs["preds"]), + images=inputs.images, + imgs_info=inputs.imgs_info, + scores=[], + masks=outputs["preds"], + feature_vector=outputs["feature_vector"], + ) def get_dummy_input(self, batch_size: int = 1) -> SegBatchDataEntity: """Returns a dummy input for semantic segmentation model.""" @@ -371,25 +423,17 @@ def _customize_outputs( outputs: list[ImageResultWithSoftPrediction], inputs: SegBatchDataEntity, ) -> SegBatchPredEntity | OTXBatchLossEntity: - if outputs and outputs[0].saliency_map.size != 1: - predicted_s_maps = [out.saliency_map for out in outputs] - predicted_f_vectors = [out.feature_vector for out in outputs] - return SegBatchPredEntity( - batch_size=len(outputs), - images=inputs.images, - imgs_info=inputs.imgs_info, - scores=[], - masks=[tv_tensors.Mask(mask.resultImage, device=self.device) for mask in outputs], - saliency_map=predicted_s_maps, - feature_vector=predicted_f_vectors, - ) - + masks = [tv_tensors.Mask(mask.resultImage, device=self.device) for mask in outputs] + predicted_f_vectors = ( + [out.feature_vector for out in outputs] if outputs and outputs[0].feature_vector.size != 1 else [] + ) return SegBatchPredEntity( batch_size=len(outputs), images=inputs.images, imgs_info=inputs.imgs_info, scores=[], - masks=[tv_tensors.Mask(mask.resultImage, device=self.device) for mask in outputs], + masks=masks, + feature_vector=predicted_f_vectors, ) def _convert_pred_entity_to_compute_metric( @@ -397,6 +441,16 @@ def _convert_pred_entity_to_compute_metric( preds: 
SegBatchPredEntity, inputs: SegBatchDataEntity, ) -> MetricInput: + """Convert prediction and input entities to a format suitable for metric computation. + + Args: + preds (SegBatchPredEntity): The predicted segmentation batch entity containing predicted masks. + inputs (SegBatchDataEntity): The input segmentation batch entity containing ground truth masks. + + Returns: + MetricInput: A list of dictionaries where each dictionary contains 'preds' and 'target' keys + corresponding to the predicted and target masks for metric evaluation. + """ return [ { "preds": pred_mask, diff --git a/src/otx/core/types/label.py b/src/otx/core/types/label.py index 7f00aa0b496..c89f67d7fd6 100644 --- a/src/otx/core/types/label.py +++ b/src/otx/core/types/label.py @@ -169,10 +169,8 @@ def from_dm_label_groups(cls, dm_label_categories: LabelCategories) -> HLabelInf dm_label_categories (LabelCategories): the label categories of datumaro. """ - def get_exclusive_group_info(all_groups: list[Label | list[Label]]) -> dict[str, Any]: + def get_exclusive_group_info(exclusive_groups: list[Label | list[Label]]) -> dict[str, Any]: """Get exclusive group information.""" - exclusive_groups = [g for g in all_groups if len(g) > 1] - last_logits_pos = 0 num_single_label_classes = 0 head_idx_to_logits_range = {} @@ -193,12 +191,10 @@ def get_exclusive_group_info(all_groups: list[Label | list[Label]]) -> dict[str, } def get_single_label_group_info( - all_groups: list[Label | list[Label]], + single_label_groups: list[Label | list[Label]], num_exclusive_groups: int, ) -> dict[str, Any]: """Get single label group information.""" - single_label_groups = [g for g in all_groups if len(g) == 1] - class_to_idx = {} for i, group in enumerate(single_label_groups): @@ -256,24 +252,31 @@ def convert_labels_if_needed( label_names = [item.name for item in dm_label_categories.items] all_groups = convert_labels_if_needed(dm_label_categories, label_names) - exclusive_group_info = get_exclusive_group_info(all_groups) - single_label_group_info = get_single_label_group_info(all_groups, exclusive_group_info["num_multiclass_heads"]) + exclusive_groups = [g for g in all_groups if len(g) > 1] + exclusive_group_info = get_exclusive_group_info(exclusive_groups) + single_label_groups = [g for g in all_groups if len(g) == 1] + single_label_group_info = get_single_label_group_info( + single_label_groups, + exclusive_group_info["num_multiclass_heads"], + ) merged_class_to_idx = merge_class_to_idx( exclusive_group_info["class_to_idx"], single_label_group_info["class_to_idx"], ) + label_to_idx = {lbl: i for i, lbl in enumerate(merged_class_to_idx.keys())} + return HLabelInfo( label_names=label_names, - label_groups=all_groups, + label_groups=exclusive_groups + single_label_groups, num_multiclass_heads=exclusive_group_info["num_multiclass_heads"], num_multilabel_classes=single_label_group_info["num_multilabel_classes"], head_idx_to_logits_range=exclusive_group_info["head_idx_to_logits_range"], num_single_label_classes=exclusive_group_info["num_single_label_classes"], class_to_group_idx=merged_class_to_idx, - all_groups=all_groups, - label_to_idx=dm_label_categories._indices, # noqa: SLF001 + all_groups=exclusive_groups + single_label_groups, + label_to_idx=label_to_idx, label_tree_edges=get_label_tree_edges(dm_label_categories.items), empty_multiclass_head_indices=[], # consider the label removing case ) diff --git a/src/otx/engine/adaptive_bs/bs_search_algo.py b/src/otx/engine/adaptive_bs/bs_search_algo.py index a029d10aa6d..c0400eff284 100644 --- 
a/src/otx/engine/adaptive_bs/bs_search_algo.py +++ b/src/otx/engine/adaptive_bs/bs_search_algo.py @@ -112,8 +112,14 @@ def auto_decrease_batch_size(self) -> int: break if available_bs == 0: - msg = "Current device can't train model even with 2." - raise RuntimeError(msg) + if oom: + msg = "Current device can't train model even with 2." + raise RuntimeError(msg) + logger.warning( + "Even with a batch size of 2, most of the memory is used, " + "which could cause the training to fail midway.", + ) + available_bs = 2 return available_bs @@ -141,8 +147,14 @@ def find_big_enough_batch_size(self, drop_last: bool = False) -> int: if oom or bs_mem_usage > self._mem_upper_bound: self._default_bs -= 2 if self._default_bs <= 0: - msg = "Current device can't train model even with 2." - raise RuntimeError(msg) + if oom: + msg = "Current device can't train model even with 2." + raise RuntimeError(msg) + logger.warning( + "Even with a batch size of 2, most of the memory is used, " + "which could cause the training to fail midway.", + ) + return 2 return self.auto_decrease_batch_size() diff --git a/src/otx/engine/engine.py b/src/otx/engine/engine.py index 47647caf7d6..8803870f3a4 100644 --- a/src/otx/engine/engine.py +++ b/src/otx/engine/engine.py @@ -5,6 +5,7 @@ from __future__ import annotations +import copy import csv import inspect import logging @@ -366,18 +367,32 @@ def test( # NOTE, trainer.test takes only lightning based checkpoint. # So, it can't take the OTX1.x checkpoint. if checkpoint is not None and not is_ir_ckpt: + kwargs_user_input: dict[str, Any] = {} + if self.task == OTXTaskType.ZERO_SHOT_VISUAL_PROMPTING: + # to update user's custom infer_reference_info_root through cli for zero-shot learning + # TODO (sungchul): revisit for better solution + kwargs_user_input.update(infer_reference_info_root=self.model.infer_reference_info_root) + model_cls = model.__class__ - model = model_cls.load_from_checkpoint(checkpoint_path=checkpoint, **model.hparams) + model = model_cls.load_from_checkpoint(checkpoint_path=checkpoint, **kwargs_user_input) if model.label_info != self.datamodule.label_info: - msg = ( - "To launch a test pipeline, the label information should be same " - "between the training and testing datasets. " - "Please check whether you use the same dataset: " - f"model.label_info={model.label_info}, " - f"datamodule.label_info={self.datamodule.label_info}" - ) - raise ValueError(msg) + if ( + self.task == "SEMANTIC_SEGMENTATION" + and "otx_background_lbl" in self.datamodule.label_info.label_names + and (len(self.datamodule.label_info.label_names) - len(model.label_info.label_names) == 1) + ): + # workaround for background label + model.label_info = copy.deepcopy(self.datamodule.label_info) + else: + msg = ( + "To launch a test pipeline, the label information should be same " + "between the training and testing datasets. 
" + "Please check whether you use the same dataset: " + f"model.label_info={model.label_info}, " + f"datamodule.label_info={self.datamodule.label_info}" + ) + raise ValueError(msg) self._build_trainer(**kwargs) @@ -453,8 +468,14 @@ def predict( datamodule = self._auto_configurator.update_ov_subset_pipeline(datamodule=datamodule, subset="test") if checkpoint is not None and not is_ir_ckpt: + kwargs_user_input: dict[str, Any] = {} + if self.task == OTXTaskType.ZERO_SHOT_VISUAL_PROMPTING: + # to update user's custom infer_reference_info_root through cli for zero-shot learning + # TODO (sungchul): revisit for better solution + kwargs_user_input.update(infer_reference_info_root=self.model.infer_reference_info_root) + model_cls = model.__class__ - model = model_cls.load_from_checkpoint(checkpoint_path=checkpoint, **model.hparams) + model = model_cls.load_from_checkpoint(checkpoint_path=checkpoint, **kwargs_user_input) if model.label_info != self.datamodule.label_info: msg = ( @@ -565,11 +586,17 @@ def export( ) if not is_ir_ckpt: + kwargs_user_input: dict[str, Any] = {} + if self.task == OTXTaskType.ZERO_SHOT_VISUAL_PROMPTING: + # to update user's custom infer_reference_info_root through cli for zero-shot learning + # TODO (sungchul): revisit for better solution + kwargs_user_input.update(infer_reference_info_root=self.model.infer_reference_info_root) + model_cls = self.model.__class__ self.model = model_cls.load_from_checkpoint( checkpoint_path=checkpoint, map_location="cpu", - **self.model.hparams, + **kwargs_user_input, ) self.model.eval() @@ -733,8 +760,14 @@ def explain( model = self._auto_configurator.get_ov_model(model_name=str(checkpoint), label_info=datamodule.label_info) if checkpoint is not None and not is_ir_ckpt: + kwargs_user_input: dict[str, Any] = {} + if self.task == OTXTaskType.ZERO_SHOT_VISUAL_PROMPTING: + # to update user's custom infer_reference_info_root through cli for zero-shot learning + # TODO (sungchul): revisit for better solution + kwargs_user_input.update(infer_reference_info_root=self.model.infer_reference_info_root) + model_cls = model.__class__ - model = model_cls.load_from_checkpoint(checkpoint_path=checkpoint, **model.hparams) + model = model_cls.load_from_checkpoint(checkpoint_path=checkpoint, **kwargs_user_input) if model.label_info != self.datamodule.label_info: msg = ( @@ -836,11 +869,17 @@ def benchmark( ) if not is_ir_ckpt: + kwargs_user_input: dict[str, Any] = {} + if self.task == OTXTaskType.ZERO_SHOT_VISUAL_PROMPTING: + # to update user's custom infer_reference_info_root through cli for zero-shot learning + # TODO (sungchul): revisit for better solution + kwargs_user_input.update(infer_reference_info_root=self.model.infer_reference_info_root) + model_cls = self.model.__class__ self.model = model_cls.load_from_checkpoint( checkpoint_path=checkpoint, map_location="cpu", - **self.model.hparams, + **kwargs_user_input, ) elif isinstance(self.model, OVModel): msg = "To run benchmark on OV model, checkpoint must be specified." 
@@ -874,7 +913,7 @@ def dummy_infer(model: OTXModel, batch_size: int = 1) -> float: input_batch = self.model.get_dummy_input(1) model_fwd = lambda: self.model.forward(input_batch) depth = 3 if extended_stats else 0 - fwd_flops = measure_flops(self.model.model, model_fwd, print_stats_depth=depth) + fwd_flops = measure_flops(model_fwd, print_stats_depth=depth) flops_str = convert_num_with_suffix(fwd_flops, get_suffix_str(fwd_flops * 10**3)) final_stats["complexity"] = flops_str + " MACs" except Exception as e: diff --git a/src/otx/engine/hpo/hpo_api.py b/src/otx/engine/hpo/hpo_api.py index 9f470944266..9025cfd0fdc 100644 --- a/src/otx/engine/hpo/hpo_api.py +++ b/src/otx/engine/hpo/hpo_api.py @@ -9,6 +9,7 @@ import json import logging import time +from copy import copy from functools import partial from pathlib import Path from threading import Thread @@ -16,6 +17,7 @@ import torch import yaml +from lightning import Callback from otx.core.config.hpo import HpoConfig from otx.core.optimizer.callable import OptimizerCallableSupportHPO @@ -35,7 +37,6 @@ from .utils import find_trial_file, get_best_hpo_weight, get_callable_args_name, get_hpo_weight_dir, get_metric if TYPE_CHECKING: - from lightning import Callback from lightning.pytorch.cli import OptimizerCallable from otx.engine.engine import Engine @@ -48,7 +49,6 @@ def execute_hpo( engine: Engine, max_epochs: int, hpo_config: HpoConfig, - progress_update_callback: Callable[[int | float], None] | None = None, callbacks: list[Callback] | Callback | None = None, **train_args, ) -> tuple[dict[str, Any] | None, Path | None]: @@ -58,8 +58,6 @@ def execute_hpo( engine (Engine): engine instnace. max_epochs (int): max epochs to train. hpo_config (HpoConfig): Configuration for HPO. - progress_update_callback (Callable[[int | float], None] | None, optional): - callback to update progress. If it's given, it's called with progress every second. Defaults to None. callbacks (list[Callback] | Callback | None, optional): callbacks used during training. Defaults to None. 
Returns: @@ -97,8 +95,23 @@ def execute_hpo( logger.warning("HPO is skipped.") return None, None - if progress_update_callback is not None: - Thread(target=_update_hpo_progress, args=[progress_update_callback, hpo_algo], daemon=True).start() + if hpo_config.progress_update_callback is not None: + Thread(target=_update_hpo_progress, args=[hpo_config.progress_update_callback, hpo_algo], daemon=True).start() + + if hpo_config.callbacks_to_exclude is not None and callbacks is not None: + if isinstance(hpo_config.callbacks_to_exclude, str): + hpo_config.callbacks_to_exclude = [hpo_config.callbacks_to_exclude] + if isinstance(callbacks, Callback): + callbacks = [callbacks] + + callbacks = copy(callbacks) + callback_names = [callback.__class__.__name__ for callback in callbacks] + callback_idx_to_exclude = [ + callback_names.index(cb_name) for cb_name in hpo_config.callbacks_to_exclude if cb_name in callback_names + ] + sorted(callback_idx_to_exclude, reverse=True) + for idx in callback_idx_to_exclude: + callbacks.pop(idx) run_hpo_loop( hpo_algo, diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py index 6207d33f342..f875fb6b9b7 100644 --- a/src/otx/engine/utils/auto_configurator.py +++ b/src/otx/engine/utils/auto_configurator.py @@ -94,6 +94,7 @@ OTXTaskType.ANOMALY_DETECTION: "otx.algo.anomaly.openvino_model.AnomalyOpenVINO", OTXTaskType.ANOMALY_SEGMENTATION: "otx.algo.anomaly.openvino_model.AnomalyOpenVINO", OTXTaskType.KEYPOINT_DETECTION: "otx.core.model.keypoint_detection.OVKeypointDetectionModel", + OTXTaskType.OBJECT_DETECTION_3D: "otx.core.model.detection_3d.OV3DDetectionModel", } diff --git a/src/otx/recipe/_base_/data/anomaly.yaml b/src/otx/recipe/_base_/data/anomaly.yaml index 2f74b987915..dd3a4f244c6 100644 --- a/src/otx/recipe/_base_/data/anomaly.yaml +++ b/src/otx/recipe/_base_/data/anomaly.yaml @@ -1,5 +1,5 @@ task: ANOMALY_CLASSIFICATION -input_size: 256 +input_size: [256, 256] data_format: mvtec mem_cache_size: 1GB mem_cache_img_max_size: null @@ -13,11 +13,10 @@ train_subset: batch_size: 32 num_workers: 4 transforms: - - class_path: otx.core.data.transform_libs.torchvision.ResizetoLongestEdge + - class_path: torchvision.transforms.v2.Resize init_args: - size: $(input_size) + size: [256, 256] antialias: true - - class_path: otx.core.data.transform_libs.torchvision.PadtoSquare - class_path: torchvision.transforms.v2.ToDtype init_args: dtype: ${as_torch_dtype:torch.float32} @@ -36,11 +35,10 @@ val_subset: batch_size: 32 num_workers: 4 transforms: - - class_path: otx.core.data.transform_libs.torchvision.ResizetoLongestEdge + - class_path: torchvision.transforms.v2.Resize init_args: - size: $(input_size) + size: [256, 256] antialias: true - - class_path: otx.core.data.transform_libs.torchvision.PadtoSquare - class_path: torchvision.transforms.v2.ToDtype init_args: dtype: ${as_torch_dtype:torch.float32} @@ -59,11 +57,10 @@ test_subset: batch_size: 32 num_workers: 4 transforms: - - class_path: otx.core.data.transform_libs.torchvision.ResizetoLongestEdge + - class_path: torchvision.transforms.v2.Resize init_args: - size: $(input_size) + size: [256, 256] antialias: true - - class_path: otx.core.data.transform_libs.torchvision.PadtoSquare - class_path: torchvision.transforms.v2.ToDtype init_args: dtype: ${as_torch_dtype:torch.float32} diff --git a/src/otx/recipe/_base_/data/object_detection_3d.yaml b/src/otx/recipe/_base_/data/object_detection_3d.yaml index a7c773f1bcf..90b0527ada5 100644 --- 
a/src/otx/recipe/_base_/data/object_detection_3d.yaml +++ b/src/otx/recipe/_base_/data/object_detection_3d.yaml @@ -12,9 +12,20 @@ train_subset: subset_name: train transform_lib_type: TORCHVISION batch_size: 8 - num_workers: 4 + num_workers: 2 to_tv_image: false transforms: + - class_path: otx.core.data.transform_libs.torchvision.Decode3DInputsAffineTransforms + init_args: + input_size: $(input_size) + random_horizontal_flip: true + random_crop: true + p_crop: 0.5 + random_scale: 0.05 + random_shift: 0.05 + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} - class_path: torchvision.transforms.v2.Normalize init_args: mean: [123.675, 116.28, 103.53] @@ -27,9 +38,16 @@ val_subset: subset_name: val transform_lib_type: TORCHVISION batch_size: 16 - num_workers: 4 + num_workers: 2 to_tv_image: false transforms: + - class_path: otx.core.data.transform_libs.torchvision.Decode3DInputsAffineTransforms + init_args: + input_size: $(input_size) + decode_annotations: false + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} - class_path: torchvision.transforms.v2.Normalize init_args: mean: [123.675, 116.28, 103.53] @@ -41,9 +59,16 @@ test_subset: subset_name: test transform_lib_type: TORCHVISION batch_size: 16 - num_workers: 4 + num_workers: 2 to_tv_image: false transforms: + - class_path: otx.core.data.transform_libs.torchvision.Decode3DInputsAffineTransforms + init_args: + input_size: $(input_size) + decode_annotations: false + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} - class_path: torchvision.transforms.v2.Normalize init_args: mean: [123.675, 116.28, 103.53] diff --git a/src/otx/recipe/object_detection_3d/monodetr3d.yaml b/src/otx/recipe/object_detection_3d/monodetr3d.yaml index 032c71ffbf8..ec5aaa005eb 100644 --- a/src/otx/recipe/object_detection_3d/monodetr3d.yaml +++ b/src/otx/recipe/object_detection_3d/monodetr3d.yaml @@ -20,13 +20,13 @@ model: mode: max factor: 0.1 patience: 13 - monitor: val/mAP_bbox_2d + monitor: val/AP_2d@0.5 engine: task: OBJECT_DETECTION_3D device: auto -callback_monitor: val/mAP_bbox_3d +callback_monitor: val/AP_3d@0.5 data: ../_base_/data/object_detection_3d.yaml diff --git a/src/otx/recipe/object_detection_3d/openvino_model.yaml b/src/otx/recipe/object_detection_3d/openvino_model.yaml new file mode 100644 index 00000000000..62265f06f6e --- /dev/null +++ b/src/otx/recipe/object_detection_3d/openvino_model.yaml @@ -0,0 +1,43 @@ +model: + class_path: otx.core.model.detection_3d.OV3DDetectionModel + init_args: + label_info: 3 + model_name: monodetr-001 + model_type: "mono_3d_det" + async_inference: true + use_throughput_mode: true + +engine: + task: OBJECT_DETECTION_3D + device: cpu + +callback_monitor: val/mAP_bbox_2d + +data: ../_base_/data/object_detection_3d.yaml +overrides: + reset: + - data.train_subset.transforms + - data.val_subset.transforms + - data.test_subset.transforms + + data: + stack_images: false + train_subset: + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Decode3DInputsAffineTransforms + init_args: + decode_annotations: false + + val_subset: + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Decode3DInputsAffineTransforms + init_args: + decode_annotations: false + + test_subset: + to_tv_image: false + batch_size: 64 + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Decode3DInputsAffineTransforms + init_args: + decode_annotations: 
false diff --git a/src/otx/recipe/rotated_detection/maskrcnn_efficientnetb2b.yaml b/src/otx/recipe/rotated_detection/maskrcnn_efficientnetb2b.yaml index 1909a5434a6..81cd040685a 100644 --- a/src/otx/recipe/rotated_detection/maskrcnn_efficientnetb2b.yaml +++ b/src/otx/recipe/rotated_detection/maskrcnn_efficientnetb2b.yaml @@ -1,7 +1,7 @@ model: class_path: otx.core.model.rotated_detection.RotatedMaskRCNNModel init_args: - model_name: efficientnet_b2b + model_name: maskrcnn_efficientnet_b2b label_info: 80 optimizer: diff --git a/src/otx/recipe/rotated_detection/maskrcnn_efficientnetb2b_tile.yaml b/src/otx/recipe/rotated_detection/maskrcnn_efficientnetb2b_tile.yaml index 90b5914df79..9a950613d6b 100644 --- a/src/otx/recipe/rotated_detection/maskrcnn_efficientnetb2b_tile.yaml +++ b/src/otx/recipe/rotated_detection/maskrcnn_efficientnetb2b_tile.yaml @@ -1,7 +1,7 @@ model: class_path: otx.core.model.rotated_detection.RotatedMaskRCNNModel init_args: - model_name: efficientnet_b2b + model_name: maskrcnn_efficientnet_b2b label_info: 80 optimizer: diff --git a/src/otx/tools/converter.py b/src/otx/tools/converter.py index cb74298e910..e476a35e1e4 100644 --- a/src/otx/tools/converter.py +++ b/src/otx/tools/converter.py @@ -156,7 +156,6 @@ "task": OTXTaskType.SEMANTIC_SEGMENTATION, "model_name": "dino_v2", }, - # ANOMALY_CLASSIFICATION # ANOMALY "ote_anomaly_padim": { "task": OTXTaskType.ANOMALY, @@ -193,6 +192,15 @@ "task": OTXTaskType.ANOMALY_SEGMENTATION, "model_name": "stfpm", }, + # KEYPOINT_DETECTION + "Custom_Keypoint_Detection_Rtmpose_T": { + "task": OTXTaskType.KEYPOINT_DETECTION, + "model_name": "rtmpose_tiny", + }, + "Custom_Keypoint_Detection_Rtmpose_T_Single_Obj": { + "task": OTXTaskType.KEYPOINT_DETECTION, + "model_name": "rtmpose_tiny_single_obj", + }, } diff --git a/src/otx/tools/templates/segmentation/ocr_lite_hrnet_18/template.yaml b/src/otx/tools/templates/segmentation/ocr_lite_hrnet_18/template.yaml new file mode 100644 index 00000000000..f0e296c6ce0 --- /dev/null +++ b/src/otx/tools/templates/segmentation/ocr_lite_hrnet_18/template.yaml @@ -0,0 +1,46 @@ +# Description. +model_template_id: Custom_Semantic_Segmentation_Lite-HRNet-18_OCR +name: Lite-HRNet-18 +task_type: SEGMENTATION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Semantic Segmentation with middle-sized architecture which based on the Lite-HRNet backbone for the balance between the fast inference and long training. (OBSOLETE, please use Lite-HRNet-18-mod2 instead) +application: ~ + +# Algo backend. +framework: OTXSegmentation v0.14.0 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 8 + learning_rate: + default_value: 0.001 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 100 + num_iters: + default_value: 300 + algo_backend: + train_type: + default_value: Incremental + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. 
+gigaflops: 3.45 +size: 4.5 + +# Model spec +model_status: OBSOLETE diff --git a/src/otx/utils/utils.py b/src/otx/utils/utils.py index e084eb7bd95..c1e735201e7 100644 --- a/src/otx/utils/utils.py +++ b/src/otx/utils/utils.py @@ -263,7 +263,6 @@ def check_pickleable(obj: Any) -> bool: # noqa: ANN401 def measure_flops( - model: torch.nn.Module, forward_fn: Callable[[], torch.Tensor], loss_fn: Callable[[torch.Tensor], torch.Tensor] | None = None, print_stats_depth: int = 0, @@ -271,7 +270,7 @@ def measure_flops( """Utility to compute the total number of FLOPs used by a module during training or during inference.""" from torch.utils.flop_counter import FlopCounterMode - flop_counter = FlopCounterMode(model, display=print_stats_depth > 0, depth=print_stats_depth) + flop_counter = FlopCounterMode(display=print_stats_depth > 0, depth=print_stats_depth) with flop_counter: if loss_fn is None: forward_fn() diff --git a/tests/e2e/cli/test_cli.py b/tests/e2e/cli/test_cli.py index 19424f722de..3078784a8fd 100644 --- a/tests/e2e/cli/test_cli.py +++ b/tests/e2e/cli/test_cli.py @@ -52,6 +52,8 @@ def test_otx_e2e_cli( if task == OTXTaskType.INSTANCE_SEGMENTATION: is_tiling = "tile" in recipe dataset_path = fxt_target_dataset_per_task[task]["tiling" if is_tiling else "non_tiling"] + elif task == OTXTaskType.KEYPOINT_DETECTION: + dataset_path = fxt_target_dataset_per_task[task][model_name] else: dataset_path = fxt_target_dataset_per_task[task] @@ -138,7 +140,7 @@ def test_otx_e2e_cli( ExportCase2Test("ONNX", False, "exported_model_decoder.onnx"), ExportCase2Test("OPENVINO", False, "exported_model_decoder.xml"), ] - elif "ANOMALY" in task or OTXTaskType.KEYPOINT_DETECTION in task: + elif task in ("ANOMALY", OTXTaskType.KEYPOINT_DETECTION, OTXTaskType.OBJECT_DETECTION_3D): fxt_export_list = [ ExportCase2Test("ONNX", False, "exported_model.onnx"), ExportCase2Test("OPENVINO", False, "exported_model.xml"), @@ -178,6 +180,9 @@ def test_otx_e2e_cli( assert latest_dir.exists() assert (latest_dir / export_case.expected_output).exists() + if task == OTXTaskType.OBJECT_DETECTION_3D: + return # "3D Object Detection is not supported for OV IR inference. + # 4) infer of the exported models ov_output_dir = tmp_path_test / "outputs" / "OPENVINO" ov_files = list(ov_output_dir.rglob("exported*.xml")) @@ -220,8 +225,8 @@ def test_otx_e2e_cli( # 5) otx export with XAI if "instance_segmentation/rtmdet_inst_tiny" in recipe: return - if ("_cls" not in task) and (task not in ["detection", "instance_segmentation"]): - return # Supported only for classification, detection and instance segmentation task. + if ("_cls" not in task) and (task not in ["detection", "instance_segmentation", "semantic_segmentation"]): + return # Supported only for classification, detection and segmentation tasks. unsupported_models = ["dino", "rtdetr"] if any(model in model_name for model in unsupported_models): @@ -302,6 +307,24 @@ def test_otx_explain_e2e_cli( ]: pytest.skip("Supported only for classification, detection and instance segmentation task.") + models_not_supported = [ + "dino", + "yolov9_s", + "yolov9_c", + "rtdetr_18", + "rtdetr_18_tile", + "rtdetr_50_tile", + "yolov9_m", + "rtdetr_101_tile", + "rtdetr_50", + "rtdetr_101", + "maskrcnn_r50_tv", + "maskrcnn_r50_tv_tile", + ] + + if any(model in model_name for model in models_not_supported): + pytest.skip(f"{model_name} is not supported.") + deterministic = "True" if task == OTXTaskType.INSTANCE_SEGMENTATION: # Determinism is not required for this test for instance_segmentation models. 
@@ -314,9 +337,6 @@ def test_otx_explain_e2e_cli( if isinstance(dataset_path, dict) and "supervised" in dataset_path: dataset_path = dataset_path["supervised"] - if "dino" in model_name: - pytest.skip("DINO is not supported.") - # otx explain tmp_path_explain = tmp_path / f"otx_explain_{model_name}" command_cfg = [ @@ -431,6 +451,8 @@ def test_otx_hpo_e2e_cli( if task == OTXTaskType.INSTANCE_SEGMENTATION: dataset_path = fxt_target_dataset_per_task[task]["non_tiling"] + elif task == OTXTaskType.KEYPOINT_DETECTION: + dataset_path = fxt_target_dataset_per_task[task]["rtmpose_tiny"] else: dataset_path = fxt_target_dataset_per_task[task] diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index 0dced369ee4..e3778734388 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -99,7 +99,7 @@ def fxt_target_dataset_per_task(fxt_ci_data_root) -> dict: OTXTaskType.MULTI_LABEL_CLS: Path(fxt_ci_data_root / "v2/multilabel_classification/multilabel_CUB_small/1"), OTXTaskType.H_LABEL_CLS: Path(fxt_ci_data_root / "v2/hlabel_classification/hlabel_CUB_small/1"), OTXTaskType.DETECTION: Path(fxt_ci_data_root / "v2/detection/bdd_small/1"), - OTXTaskType.ROTATED_DETECTION: Path(fxt_ci_data_root / "v2/rotated_detection/subway"), + OTXTaskType.ROTATED_DETECTION: Path(fxt_ci_data_root / "v2/rotated_detection/sample"), OTXTaskType.INSTANCE_SEGMENTATION: { "non_tiling": Path(fxt_ci_data_root / "v2/instance_seg/wgisd_small/1"), "tiling": Path(fxt_ci_data_root / "v2/tiling_instance_seg/vitens_aeromonas_small/1"), @@ -109,7 +109,7 @@ def fxt_target_dataset_per_task(fxt_ci_data_root) -> dict: "unlabeled": Path(fxt_ci_data_root / "v2/semantic_seg/semi-sl/unlabeled_images/kvasir"), }, OTXTaskType.ACTION_CLASSIFICATION: Path( - fxt_ci_data_root / "v2/action/action_classification/ucf_kinetics_5percent_small", + fxt_ci_data_root / "v2/action/action_classification/ucf_kinetics_30percent_medium", ), OTXTaskType.VISUAL_PROMPTING: Path(fxt_ci_data_root / "v2/visual_prompting/coco_car_person_medium"), OTXTaskType.ZERO_SHOT_VISUAL_PROMPTING: Path( @@ -119,7 +119,11 @@ def fxt_target_dataset_per_task(fxt_ci_data_root) -> dict: OTXTaskType.ANOMALY_CLASSIFICATION: Path(fxt_ci_data_root / "v2/anomaly/mvtec/hazelnut_large"), OTXTaskType.ANOMALY_DETECTION: Path(fxt_ci_data_root / "v2/anomaly/mvtec/hazelnut_large"), OTXTaskType.ANOMALY_SEGMENTATION: Path(fxt_ci_data_root / "v2/anomaly/mvtec/hazelnut_large"), - OTXTaskType.KEYPOINT_DETECTION: Path(fxt_ci_data_root / "v2/keypoint_detection/coco_keypoint_medium"), + OTXTaskType.KEYPOINT_DETECTION: { + "rtmpose_tiny": Path(fxt_ci_data_root / "v2/keypoint_detection/coco_keypoint/medium"), + "rtmpose_tiny_single_obj": Path(fxt_ci_data_root / "v2/keypoint_detection/coco_keypoint_single_obj/medium"), + }, + OTXTaskType.OBJECT_DETECTION_3D: Path(fxt_ci_data_root / "v2/object_detection_3d/medium_pedestrian_cyclist"), } @@ -141,4 +145,5 @@ def fxt_cli_override_command_per_task() -> dict: OTXTaskType.ANOMALY_DETECTION: [], OTXTaskType.ANOMALY_SEGMENTATION: [], OTXTaskType.KEYPOINT_DETECTION: [], + OTXTaskType.OBJECT_DETECTION_3D: [], } diff --git a/tests/integration/cli/test_cli.py b/tests/integration/cli/test_cli.py index f571dc2ed2c..fd7a9438aea 100644 --- a/tests/integration/cli/test_cli.py +++ b/tests/integration/cli/test_cli.py @@ -159,6 +159,10 @@ def test_otx_e2e( ExportCase2Test("OPENVINO", False, "exported_model_decoder.xml"), ] # TODO (sungchul): EXPORTABLE_CODE will be supported + if task == "object_detection_3d": + # exportable code and demo package are not supported for 
OD 3D + fxt_export_list.pop(-1) + overrides = fxt_cli_override_command_per_task[task] tmp_path_test = tmp_path / f"otx_test_{model_name}" @@ -191,14 +195,6 @@ def test_otx_e2e( assert latest_dir.exists() assert (latest_dir / export_case.expected_output).exists() - if "keypoint" in recipe: - print("Inference and explain are not supported for keypoint detection") - return - - if "monodetr3d" in recipe: - print("Inference and explain are not supported for object detection 3d") - return - # 4) infer of the exported models ov_output_dir = tmp_path_test / "outputs" / "OPENVINO" ov_files = list(ov_output_dir.rglob("exported*.xml")) @@ -250,8 +246,8 @@ def test_otx_e2e( # 5) otx export with XAI if "instance_segmentation/rtmdet_inst_tiny" in recipe: return - if ("_cls" not in task) and (task not in ["detection", "instance_segmentation"]): - return # Supported only for classification, detection and instance segmentation task. + if ("_cls" not in task) and (task not in ["detection", "instance_segmentation", "semantic_segmentation"]): + return # Supported only for classification, detection and segmentation tasks. if "dino" in model_name: return # DINO is not supported. @@ -261,10 +257,15 @@ def test_otx_e2e( if "yolov9" in model_name: return # RT-DETR currently is not supported. + if "keypoint" in recipe: print("Explain is not supported for keypoint detection") return + if "monodetr3d" in recipe: + print("Explain is not supported for object detection 3d") + return + tmp_path_test = tmp_path / f"otx_export_xai_{model_name}" for export_case in fxt_export_list: command_cfg = [ diff --git a/tests/integration/cli/test_export_inference.py b/tests/integration/cli/test_export_inference.py index f39547ca81a..1d455616c4f 100644 --- a/tests/integration/cli/test_export_inference.py +++ b/tests/integration/cli/test_export_inference.py @@ -49,6 +49,7 @@ def fxt_local_seed() -> int: "zero_shot_visual_prompting": "test/f1-score", "action_classification": "test/accuracy", "keypoint_detection": "test/PCK", + "object_detection_3d": "test/AP_3d@0.5", } diff --git a/tests/perf/test_object_detection_3d.py b/tests/perf/test_object_detection_3d.py index 74a5bb43ca9..2fae45c8221 100644 --- a/tests/perf/test_object_detection_3d.py +++ b/tests/perf/test_object_detection_3d.py @@ -40,10 +40,14 @@ class TestPerfObjectDetection3D(PerfTestBase): BENCHMARK_CRITERIA = [ # noqa: RUF012 Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1), Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1), - Benchmark.Criterion(name="val/accuracy", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="test/accuracy", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="export/accuracy", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="optimize/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="val/AP_3d@0.5", summary="max", compare=">", margin=0.05), + Benchmark.Criterion(name="val/AP_2d@0.5", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/AP_3d@0.5", summary="max", compare=">", margin=0.05), + Benchmark.Criterion(name="test/AP_2d@0.5", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/AP_3d@0.5", summary="max", compare=">", margin=0.05), + Benchmark.Criterion(name="export/AP_2d@0.5", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/AP_3d@0.5", summary="max", compare=">", margin=0.05), + Benchmark.Criterion(name="optimize/AP_2d@0.5", 
summary="max", compare=">", margin=0.1), Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), diff --git a/src/otx/core/data/dataset/utils/__init__.py b/tests/unit/algo/object_detection_3d/__init__.py similarity index 64% rename from src/otx/core/data/dataset/utils/__init__.py rename to tests/unit/algo/object_detection_3d/__init__.py index 0c75fd7a904..189d63933b4 100644 --- a/src/otx/core/data/dataset/utils/__init__.py +++ b/tests/unit/algo/object_detection_3d/__init__.py @@ -1,4 +1,4 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # -"""Module defines utils for OTXDatasets.""" +"""Test of OTX Object Detection 3D task.""" diff --git a/tests/unit/algo/object_detection_3d/backbones/__init__.py b/tests/unit/algo/object_detection_3d/backbones/__init__.py new file mode 100644 index 00000000000..a9de1fff0dc --- /dev/null +++ b/tests/unit/algo/object_detection_3d/backbones/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Test of OTX Object Detection 3D backbones.""" diff --git a/tests/unit/algo/object_detection_3d/backbones/test_monodetr_resnet.py b/tests/unit/algo/object_detection_3d/backbones/test_monodetr_resnet.py new file mode 100644 index 00000000000..39975fe3bf9 --- /dev/null +++ b/tests/unit/algo/object_detection_3d/backbones/test_monodetr_resnet.py @@ -0,0 +1,78 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Tests for MonoDetr backbone.""" +import pytest +import torch +from otx.algo.object_detection_3d.backbones.monodetr_resnet import BackboneBase, Joiner, PositionEmbeddingSine +from otx.algo.object_detection_3d.utils.utils import NestedTensor + + +class TestBackbone: + @pytest.fixture() + def backbone(self, mocker): + mocker.patch("otx.algo.object_detection_3d.backbones.monodetr_resnet.IntermediateLayerGetter") + model = BackboneBase(backbone=mocker.MagicMock(torch.nn.Module), train_backbone=True, return_interm_layers=True) + model.body = mocker.MagicMock(return_value={"layer_0": torch.rand(1, 3, 256, 224)}) + return model + + def test_backbone_forward(self, backbone): + images = torch.randn(1, 3, 224, 224) + output = backbone(images) + assert isinstance(output, dict) + assert len(output) == 1 + assert all(isinstance(value, NestedTensor) for value in output.values()) + + def test_position_embedding_sine(self): + # Create a PositionEmbeddingSine instance + position_embedding = PositionEmbeddingSine(num_pos_feats=128, temperature=10000, normalize=False, scale=None) + + # Create a dummy input tensor + tensor_list = torch.randn(1, 512, 48, 160) + nested_tensor = NestedTensor(tensor_list, mask=torch.ones(1, 48, 160).bool()) + + # Forward pass + output = position_embedding(nested_tensor) + + # Check output shape + assert output.shape == (1, 256, 48, 160) + # Check output type + assert output.dtype == torch.float32 + # Check sine and cosine properties + assert torch.allclose( + output[:, :, :, :80].sin().pow(2) + output[:, :, :, 80:].cos().pow(2), + torch.ones(1, 256, 48, 80), + ) + + +class TestJoiner: + @pytest.fixture() + def joiner(self, mocker): + mocker.patch("otx.algo.object_detection_3d.backbones.monodetr_resnet.Backbone") + mocker.patch("otx.algo.object_detection_3d.backbones.monodetr_resnet.PositionEmbeddingSine") + backbone = 
mocker.MagicMock(torch.nn.Module) + backbone.strides = [4, 8, 16] + backbone.num_channels = [32, 64, 128] + position_embedding = mocker.MagicMock(torch.nn.Module) + return Joiner(backbone=backbone, position_embedding=position_embedding) + + def test_joiner_forward(self, joiner): + images = torch.randn(1, 3, 224, 224) + nested_tensors = [NestedTensor(torch.randn(1, 256, 56, 56), torch.ones(1, 56, 56).bool())] + position_embeddings = [torch.randn(1, 256, 56, 56)] + joiner[0].return_value = {0: nested_tensors[0]} + joiner[1].return_value = position_embeddings[0] + + output_tensors, output_position_embeddings = joiner(images) + + assert isinstance(output_tensors, list) + assert isinstance(output_position_embeddings, list) + assert len(output_tensors) == 1 + assert len(output_position_embeddings) == 1 + assert isinstance(output_tensors[0], NestedTensor) + assert isinstance(output_position_embeddings[0], torch.Tensor) + assert output_tensors[0].tensors.shape == (1, 256, 56, 56) + assert output_tensors[0].mask.shape == (1, 56, 56) + assert output_position_embeddings[0].shape == (1, 256, 56, 56) + assert output_tensors[0].tensors.dtype == images.dtype + assert output_position_embeddings[0].dtype == images.dtype diff --git a/tests/unit/algo/object_detection_3d/conftest.py b/tests/unit/algo/object_detection_3d/conftest.py new file mode 100644 index 00000000000..81e3326bfae --- /dev/null +++ b/tests/unit/algo/object_detection_3d/conftest.py @@ -0,0 +1,51 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Test of custom algo modules of OTX Object Detection 3D task.""" + +import pytest +import torch +from otx.core.config.data import SubsetConfig +from otx.core.data.module import OTXDataModule +from otx.core.data.transform_libs.torchvision import Decode3DInputsAffineTransforms +from otx.core.types.task import OTXTaskType +from torchvision.transforms.v2 import Normalize, ToDtype + + +@pytest.fixture() +def fxt_data_module_3d(): + return OTXDataModule( + task=OTXTaskType.OBJECT_DETECTION_3D, + data_format="kitti3d", + data_root="tests/assets/kitti3d", + train_subset=SubsetConfig( + batch_size=2, + subset_name="train", + transforms=[ + Decode3DInputsAffineTransforms((380, 1280), True), + ToDtype(torch.float), + Normalize(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), + ], + to_tv_image=False, + ), + val_subset=SubsetConfig( + batch_size=2, + subset_name="val", + transforms=[ + Decode3DInputsAffineTransforms((380, 1280), decode_annotations=False), + ToDtype(torch.float), + Normalize(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), + ], + to_tv_image=False, + ), + test_subset=SubsetConfig( + batch_size=2, + subset_name="test", + transforms=[ + Decode3DInputsAffineTransforms((380, 1280), decode_annotations=False), + ToDtype(torch.float), + Normalize(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), + ], + to_tv_image=False, + ), + ) diff --git a/tests/unit/algo/object_detection_3d/detectors/__init__.py b/tests/unit/algo/object_detection_3d/detectors/__init__.py new file mode 100644 index 00000000000..fe121ff5cc0 --- /dev/null +++ b/tests/unit/algo/object_detection_3d/detectors/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Test of OTX Object Detection 3D detectors.""" diff --git a/tests/unit/algo/object_detection_3d/detectors/test_monodetr.py b/tests/unit/algo/object_detection_3d/detectors/test_monodetr.py new file mode 100644 index 00000000000..4dd27c1bca3 --- 
/dev/null +++ b/tests/unit/algo/object_detection_3d/detectors/test_monodetr.py @@ -0,0 +1,71 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Test MonoDetr.""" + +import pytest +import torch +from otx.algo.object_detection_3d.backbones.monodetr_resnet import BackboneBuilder +from otx.algo.object_detection_3d.detectors.monodetr import MonoDETR +from otx.algo.object_detection_3d.heads.depth_predictor import DepthPredictor +from otx.algo.object_detection_3d.heads.depthaware_transformer import DepthAwareTransformerBuilder + + +class TestMonoDETR: + @pytest.fixture() + def model(self): + backbone = BackboneBuilder("monodetr_50") + # transformer + depthaware_transformer = DepthAwareTransformerBuilder("monodetr_50") + # depth prediction module + depth_predictor = DepthPredictor(depth_num_bins=80, depth_min=1e-3, depth_max=60.0, hidden_dim=256) + + num_classes = 2 + num_queries = 50 + num_feature_levels = 4 + return MonoDETR( + backbone, + depthaware_transformer, + depth_predictor, + num_classes=num_classes, + num_queries=num_queries, + num_feature_levels=num_feature_levels, + with_box_refine=True, + ) + + def test_monodetr_forward(self, model): + # Create a sample input + images = torch.randn(2, 3, 224, 224) + calibs = torch.randn(2, 3, 4) + img_sizes = torch.tensor([[224, 224], [224, 224]]) + # Perform forward pass + output = model(images, calibs, img_sizes, mode="predict") + + # Check the output + assert "scores" in output + assert "boxes_3d" in output + assert "size_3d" in output + assert "depth" in output + assert "heading_angle" in output + assert "pred_depth_map_logits" in output + assert "aux_outputs" in output + + # Check the shape of the output tensors + assert output["scores"].shape == (2, 550, 2) + assert output["boxes_3d"].shape == (2, 550, 6) + assert output["size_3d"].shape == (2, 550, 3) + assert output["depth"].shape == (2, 550, 2) + assert output["heading_angle"].shape == (2, 550, 24) + assert output["pred_depth_map_logits"].shape == (2, 81, 14, 14) + + # Check error handling when loss is None + with pytest.raises(ValueError): # noqa: PT011 + output = model(images, calibs, img_sizes, mode="loss") + + # Check the export mode + export_output = model(images, calibs, img_sizes, mode="export") + assert "scores" in export_output + assert "boxes_3d" in export_output + assert export_output["scores"].shape == (2, 550, 2) + assert export_output["scores"].min() >= 0 + assert export_output["scores"].max() <= 1 diff --git a/tests/unit/algo/object_detection_3d/heads/__init__.py b/tests/unit/algo/object_detection_3d/heads/__init__.py new file mode 100644 index 00000000000..fa4f5a8e834 --- /dev/null +++ b/tests/unit/algo/object_detection_3d/heads/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Test of OTX Object Detection 3D heads.""" diff --git a/tests/unit/algo/object_detection_3d/heads/test_depth_predictor.py b/tests/unit/algo/object_detection_3d/heads/test_depth_predictor.py new file mode 100644 index 00000000000..5341e705b96 --- /dev/null +++ b/tests/unit/algo/object_detection_3d/heads/test_depth_predictor.py @@ -0,0 +1,43 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Test DepthPredictor.""" + +import pytest +import torch +from otx.algo.object_detection_3d.heads.depth_predictor import DepthPredictor + + +class TestDepthPredictor: + @pytest.fixture() + def depth_predictor(self): + return DepthPredictor(depth_num_bins=10, depth_min=0.0, 
depth_max=1.0, hidden_dim=256) + + def test_depth_predictor_forward(self, depth_predictor): + feature = [ + torch.randn(1, 256, 48, 160), + torch.randn(1, 256, 24, 80), + torch.randn(1, 256, 12, 40), + torch.randn(1, 256, 6, 20), + ] + mask = torch.randn(1, 24, 80) + pos = torch.randn(1, 256, 24, 80) + + depth_logits, depth_embed, weighted_depth, depth_pos_embed_ip = depth_predictor(feature, mask, pos) + + assert depth_logits.shape == (1, 11, 24, 80) + assert depth_embed.shape == (1, 256, 24, 80) + assert weighted_depth.shape == (1, 24, 80) + assert depth_pos_embed_ip.shape == (1, 256, 24, 80) + + def test_depth_predictor_interpolate_depth_embed(self, depth_predictor): + depth = torch.randn(1, 8, 8) + interpolated_depth_embed = depth_predictor.interpolate_depth_embed(depth) + + assert interpolated_depth_embed.shape == (1, 256, 8, 8) + + def test_depth_predictor_interpolate_1d(self, depth_predictor): + coord = torch.randn(1, 8, 8).clamp(min=0, max=1) + interpolated_embeddings = depth_predictor.interpolate_1d(coord, depth_predictor.depth_pos_embed) + + assert interpolated_embeddings.shape == (1, 8, 8, 256) diff --git a/tests/unit/algo/object_detection_3d/heads/test_depthaware_transformer.py b/tests/unit/algo/object_detection_3d/heads/test_depthaware_transformer.py new file mode 100644 index 00000000000..ddc6c234fac --- /dev/null +++ b/tests/unit/algo/object_detection_3d/heads/test_depthaware_transformer.py @@ -0,0 +1,67 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""test depth aware transformer head for 3d object detection.""" + +import pytest +import torch +from otx.algo.object_detection_3d.heads.depthaware_transformer import ( + DepthAwareTransformerBuilder, +) + + +class TestDepthAwareTransformer: + @pytest.fixture() + def depth_aware_transformer(self): + return DepthAwareTransformerBuilder("monodetr_50") + + def test_depth_aware_transformer_forward(self, depth_aware_transformer): + # Create dummy input tensors + srcs = [ + torch.randn(1, 256, 48, 160), + torch.randn(1, 256, 24, 80), + torch.randn(1, 256, 12, 40), + torch.randn(1, 256, 6, 20), + ] + masks = [ + torch.randn(1, 48, 160) < 0, + torch.randn(1, 24, 80) < 0, + torch.randn(1, 12, 40) < 0, + torch.randn(1, 6, 20) < 0, + ] + pos_embeds = [ + torch.randn(1, 256, 48, 160), + torch.randn(1, 256, 24, 80), + torch.randn(1, 256, 12, 40), + torch.randn(1, 256, 6, 20), + ] + query_embed = torch.randn(550, 512) + depth_pos_embed = torch.randn(1, 256, 24, 80) + depth_pos_embed_ip = torch.randn(1, 256, 24, 80) + attn_mask = None + depth_aware_transformer.decoder.return_intermediate = False + output = depth_aware_transformer.forward( + srcs, + masks, + pos_embeds, + query_embed, + depth_pos_embed, + depth_pos_embed_ip, + attn_mask, + ) + + # Check output shape + assert len(output) == 6 + assert output[0].shape == (1, 550, 256) + assert output[2].shape == (1, 550, 2) + assert output[4] is None + + def test_depth_aware_transformer_get_valid_ratio(self, depth_aware_transformer): + # Create dummy input tensor + mask = torch.randn(2, 32, 32) > 0 + + # Get valid ratio + valid_ratio = depth_aware_transformer.get_valid_ratio(mask) + + # Check output shape + assert valid_ratio.shape == (2, 2) diff --git a/tests/unit/algo/object_detection_3d/losses/__init__.py b/tests/unit/algo/object_detection_3d/losses/__init__.py new file mode 100644 index 00000000000..317723ad150 --- /dev/null +++ b/tests/unit/algo/object_detection_3d/losses/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# 
SPDX-License-Identifier: Apache-2.0 +# +"""Test of OTX Object Detection 3D losses.""" diff --git a/tests/unit/algo/object_detection_3d/losses/test_monodetr_loss.py b/tests/unit/algo/object_detection_3d/losses/test_monodetr_loss.py new file mode 100644 index 00000000000..f041445619a --- /dev/null +++ b/tests/unit/algo/object_detection_3d/losses/test_monodetr_loss.py @@ -0,0 +1,101 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Unit test for MonoDETR loss.""" +import torch +from otx.algo.object_detection_3d.losses.monodetr_loss import MonoDETRCriterion + + +class TestMonoDETRCriterion: + def test_loss_labels(self): + criterion = MonoDETRCriterion(num_classes=10, weight_dict={}, focal_alpha=0.5) + outputs = { + "scores": torch.randn(2, 10, 10), + } + targets = [ + {"labels": torch.tensor([1, 2, 0, 0, 0, 0, 1, 2, 0, 0])}, + {"labels": torch.tensor([3, 4, 0, 0, 0, 0, 3, 4, 0, 0])}, + ] + indices = [ + (torch.tensor([0, 1]), torch.tensor([0, 1])), + (torch.tensor([0, 1]), torch.tensor([0, 1])), + ] + num_boxes = 4 + + loss = criterion.loss_labels(outputs, targets, indices, num_boxes) + assert "loss_ce" in loss + assert isinstance(loss["loss_ce"], torch.Tensor) + + def test_loss_3dcenter(self): + criterion = MonoDETRCriterion(num_classes=10, weight_dict={}, focal_alpha=0.5) + outputs = { + "boxes_3d": torch.randn(2, 10, 4), + } + targets = [ + {"boxes_3d": torch.tensor([[1, 2], [3, 4]])}, + {"boxes_3d": torch.tensor([[5, 6], [7, 8]])}, + ] + indices = [ + (torch.tensor([0, 1]), torch.tensor([0, 1])), + (torch.tensor([0, 1]), torch.tensor([0, 1])), + ] + num_boxes = 4 + + loss = criterion.loss_3dcenter(outputs, targets, indices, num_boxes) + assert "loss_center" in loss + assert isinstance(loss["loss_center"], torch.Tensor) + + def test_forward(self): + criterion = MonoDETRCriterion(num_classes=10, weight_dict={}, focal_alpha=0.5) + outputs = { + "scores": torch.randn(1, 100, 10), + "boxes_3d": torch.randn(1, 100, 6), + "depth": torch.randn(1, 100, 2), + "size_3d": torch.randn(1, 100, 3), + "heading_angle": torch.randn(1, 100, 24), + "pred_depth_map_logits": torch.randn(1, 100, 80, 80), + } + targets = [ + { + "labels": torch.tensor([0, 0, 0, 0]), + "boxes": torch.tensor( + [ + [0.7697, 0.4923, 0.0398, 0.0663], + [0.7371, 0.4857, 0.0339, 0.0620], + [0.7126, 0.4850, 0.0246, 0.0501], + [0.5077, 0.5280, 0.0444, 0.1475], + ], + ), + "depth": torch.tensor([[47.5800], [55.2600], [62.3900], [23.7700]]), + "size_3d": torch.tensor( + [ + [1.5500, 1.3700, 3.9700], + [1.6900, 1.7400, 3.7600], + [1.5500, 1.3900, 3.5500], + [1.6200, 1.6300, 4.5000], + ], + ), + "heading_angle": torch.tensor( + [ + [2.0000e00, 4.6737e-02], + [8.0000e00, 1.2180e-01], + [8.0000e00, 1.5801e-01], + [9.0000e00, 1.8260e-04], + ], + ), + "boxes_3d": torch.tensor( + [ + [0.7689, 0.4918, 0.0191, 0.0208, 0.0327, 0.0336], + [0.7365, 0.4858, 0.0163, 0.0175, 0.0310, 0.0310], + [0.7122, 0.4848, 0.0118, 0.0127, 0.0248, 0.0252], + [0.5089, 0.5234, 0.0235, 0.0209, 0.0693, 0.0783], + ], + ), + }, + ] + + losses = criterion.forward(outputs, targets) + assert isinstance(losses, dict) + assert len(losses) == 8 + for loss in losses.values(): + assert isinstance(loss, torch.Tensor) diff --git a/tests/unit/algo/object_detection_3d/matchers/__init__.py b/tests/unit/algo/object_detection_3d/matchers/__init__.py new file mode 100644 index 00000000000..9df782e1015 --- /dev/null +++ b/tests/unit/algo/object_detection_3d/matchers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# 
SPDX-License-Identifier: Apache-2.0 +# +"""Test of OTX Object Detection 3D matchers.""" diff --git a/tests/unit/algo/object_detection_3d/matchers/test_matcher_3d.py b/tests/unit/algo/object_detection_3d/matchers/test_matcher_3d.py new file mode 100644 index 00000000000..c2173bd411a --- /dev/null +++ b/tests/unit/algo/object_detection_3d/matchers/test_matcher_3d.py @@ -0,0 +1,51 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Test for HungarianMatcher3D module.""" + +import pytest +import torch +from otx.algo.object_detection_3d.matchers.matcher_3d import HungarianMatcher3D + + +class TestHungarianMatcher3D: + @pytest.fixture() + def matcher(self): + return HungarianMatcher3D() + + def test_hungarian_matcher_3d(self, matcher): + outputs = { + "scores": torch.randn(1, 100, 10), + "boxes_3d": torch.randn(1, 100, 6), + } + targets = [ + { + "labels": torch.tensor([0, 0, 0, 0]), + "boxes": torch.tensor( + [ + [0.7697, 0.4923, 0.0398, 0.0663], + [0.7371, 0.4857, 0.0339, 0.0620], + [0.7126, 0.4850, 0.0246, 0.0501], + [0.5077, 0.5280, 0.0444, 0.1475], + ], + ), + "boxes_3d": torch.tensor( + [ + [0.7689, 0.4918, 0.0191, 0.0208, 0.0327, 0.0336], + [0.7365, 0.4858, 0.0163, 0.0175, 0.0310, 0.0310], + [0.7122, 0.4848, 0.0118, 0.0127, 0.0248, 0.0252], + [0.5089, 0.5234, 0.0235, 0.0209, 0.0693, 0.0783], + ], + ), + }, + ] + group_num = 11 + + result = matcher(outputs, targets, group_num) + + assert len(result) == 1 + assert isinstance(result[0][0], torch.Tensor) + assert isinstance(result[0][1], torch.Tensor) + assert len(result[0][0].tolist()) == 44 + assert len(result[0][1].tolist()) == 44 + assert torch.max(torch.stack(result[0])) <= 100 diff --git a/tests/unit/algo/object_detection_3d/test_monodetr3d.py b/tests/unit/algo/object_detection_3d/test_monodetr3d.py new file mode 100644 index 00000000000..6e9ce559895 --- /dev/null +++ b/tests/unit/algo/object_detection_3d/test_monodetr3d.py @@ -0,0 +1,48 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +"""Test of OTX MonoDETR3D architecture.""" + +import pytest +import torch +from otx.algo.object_detection_3d.monodetr3d import MonoDETR3D +from otx.core.data.entity.object_detection_3d import Det3DBatchDataEntity +from otx.core.exporter.detection_3d import OTXObjectDetection3DExporter +from otx.core.types.export import TaskLevelExportParameters + + +class TestMonoDETR3D: + @pytest.fixture() + def model(self): + return MonoDETR3D(model_name="monodetr_50", label_info=2, input_size=(1280, 384)) + + def test_init(self, model) -> None: + assert isinstance(model._export_parameters, TaskLevelExportParameters) + assert isinstance(model._exporter, OTXObjectDetection3DExporter) + + def test_loss(self, model, fxt_data_module_3d): + data = next(iter(fxt_data_module_3d.train_dataloader())) + output = model(data) + assert "loss_ce" in output + assert "loss_bbox" in output + assert "loss_center" in output + assert "loss_center_aux_1" in output + for loss in output.values(): + assert loss is not None + assert isinstance(loss, torch.Tensor) + + def test_predict(self, model, fxt_data_module_3d): + data = next(iter(fxt_data_module_3d.train_dataloader())) + model.eval() + output = model(data) + assert isinstance(output, Det3DBatchDataEntity) + + def test_export(self, model): + model.eval() + output = model.forward_for_tracing( + torch.randn(1, 3, 384, 1280), + torch.randn(1, 3, 4), + torch.tensor([[1280, 384]]), + ) + assert isinstance(output, dict) + assert len(output) == 5 + assert list(output.keys()) 
== ["scores", "boxes_3d", "size_3d", "depth", "heading_angle"] diff --git a/tests/unit/algo/segmentation/segmentors/test_base_model.py b/tests/unit/algo/segmentation/segmentors/test_base_model.py index d970ead0c32..30893e5182e 100644 --- a/tests/unit/algo/segmentation/segmentors/test_base_model.py +++ b/tests/unit/algo/segmentation/segmentors/test_base_model.py @@ -43,8 +43,10 @@ def test_forward_returns_prediction(self, model, inputs): def test_extract_features(self, model, inputs): images = inputs[0] features = model.extract_features(images) - assert isinstance(features, torch.Tensor) - assert features.shape == (1, 2, 256, 256) + assert isinstance(features, tuple) + assert isinstance(features[0], torch.Tensor) + assert isinstance(features[1], torch.Tensor) + assert features[1].shape == (1, 2, 256, 256) def test_calculate_loss(self, model, inputs): model.criterion.name = "CrossEntropyLoss" diff --git a/tests/unit/core/data/dataset/test_keypoint_detection.py b/tests/unit/core/data/dataset/test_keypoint_detection.py index 6bc19469d37..87fe62b27d4 100644 --- a/tests/unit/core/data/dataset/test_keypoint_detection.py +++ b/tests/unit/core/data/dataset/test_keypoint_detection.py @@ -19,7 +19,7 @@ def fxt_dm_dataset(self) -> DmDataset: return DmDataset.import_from("tests/assets/car_tree_bug_keypoint", format="coco_person_keypoints") @pytest.fixture() - def fxt_tvt_transforms(self, mocker) -> Identity: + def fxt_tvt_transforms(self) -> Identity: return Identity() @pytest.mark.parametrize("subset", ["train", "val"]) diff --git a/tests/unit/core/data/dataset/test_object_detection_3d.py b/tests/unit/core/data/dataset/test_object_detection_3d.py new file mode 100644 index 00000000000..f75004a45d4 --- /dev/null +++ b/tests/unit/core/data/dataset/test_object_detection_3d.py @@ -0,0 +1,58 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Unit tests of Object Detection 3D datasets.""" + +from __future__ import annotations + +import numpy as np +import pytest +from datumaro import Dataset as DmDataset +from otx.core.data.dataset.object_detection_3d import OTX3DObjectDetectionDataset +from otx.core.data.entity.base import ImageInfo +from torchvision.transforms.v2 import Identity, Transform + + +class TestOTXObjectDetection3DDataset: + @pytest.fixture() + def fxt_dm_dataset(self) -> DmDataset: + return DmDataset.import_from("tests/assets/kitti3d", format="kitti3d") + + @pytest.fixture() + def fxt_tvt_transforms(self) -> Identity: + return Identity() + + @pytest.mark.parametrize("subset", ["train", "val"]) + def test_get_item_impl_subset( + self, + fxt_dm_dataset, + fxt_tvt_transforms: Transform, + subset: str, + ) -> None: + dataset = OTX3DObjectDetectionDataset( + fxt_dm_dataset.get_subset(subset).as_dataset(), + fxt_tvt_transforms, + ) + + entity = dataset._get_item_impl(0) + + assert hasattr(entity, "image") + assert isinstance(entity.image, np.ndarray) + assert hasattr(entity, "img_info") + assert isinstance(entity.img_info, ImageInfo) + assert hasattr(entity, "calib_matrix") + assert isinstance(entity.calib_matrix, np.ndarray) + assert hasattr(entity, "boxes_3d") + assert isinstance(entity.boxes_3d, np.ndarray) + assert hasattr(entity, "boxes") + assert isinstance(entity.boxes, np.ndarray) + assert hasattr(entity, "size_2d") + assert isinstance(entity.boxes_3d, np.ndarray) + assert hasattr(entity, "size_3d") + assert isinstance(entity.boxes_3d, np.ndarray) + assert hasattr(entity, "heading_angle") + assert isinstance(entity.boxes_3d, np.ndarray) + assert 
hasattr(entity, "depth") + assert isinstance(entity.boxes_3d, np.ndarray) + assert hasattr(entity, "original_kitti_format") + assert isinstance(entity.original_kitti_format, dict) diff --git a/tests/unit/core/data/dataset/test_segmentation.py b/tests/unit/core/data/dataset/test_segmentation.py index 141dc4bf74b..c7e35d0a924 100644 --- a/tests/unit/core/data/dataset/test_segmentation.py +++ b/tests/unit/core/data/dataset/test_segmentation.py @@ -19,7 +19,7 @@ def test_get_item( max_refetch=3, ) assert isinstance(dataset[0], SegDataEntity) - assert "background" in [label_name.lower() for label_name in dataset.label_info.label_names] + assert "otx_background_lbl" in [label_name.lower() for label_name in dataset.label_info.label_names] def test_get_item_from_bbox_dataset( self, @@ -33,4 +33,4 @@ def test_get_item_from_bbox_dataset( ) assert isinstance(dataset[0], SegDataEntity) # OTXSegmentationDataset should add background when getting a dataset which includes only bbox annotations - assert "background" in [label_name.lower() for label_name in dataset.label_info.label_names] + assert "otx_background_lbl" in [label_name.lower() for label_name in dataset.label_info.label_names] diff --git a/tests/unit/core/data/transform_libs/test_torchvision.py b/tests/unit/core/data/transform_libs/test_torchvision.py index 1a1363d6821..3aad061118e 100644 --- a/tests/unit/core/data/transform_libs/test_torchvision.py +++ b/tests/unit/core/data/transform_libs/test_torchvision.py @@ -17,10 +17,12 @@ from otx.core.data.entity.detection import DetBatchDataEntity, DetDataEntity from otx.core.data.entity.instance_segmentation import InstanceSegBatchDataEntity, InstanceSegDataEntity from otx.core.data.entity.keypoint_detection import KeypointDetDataEntity +from otx.core.data.entity.object_detection_3d import Det3DDataEntity from otx.core.data.transform_libs.torchvision import ( CachedMixUp, CachedMosaic, Compose, + Decode3DInputsAffineTransforms, DecodeVideo, FilterAnnotations, GetBBoxCenterScale, @@ -918,3 +920,149 @@ def test_forward(self, keypoint_det_entity) -> None: assert np.array_equal(results.bbox_info.center, np.array([3.5, 3.5])) assert np.array_equal(results.bbox_info.scale, np.array([8.75, 8.75])) assert results.keypoints.shape == (4, 2) + + +class TestDecode3DInputsAffineTransforms: + @pytest.fixture() + def decode_transform(self) -> Decode3DInputsAffineTransforms: + return Decode3DInputsAffineTransforms(input_size=(380, 1280), decode_annotations=True) + + @pytest.fixture() + def original_kitti_format(self) -> dict[str, np.array]: + return { + "name": np.array([0]), + "alpha": np.array([1.55]), + "bbox": np.array([[614.23999023, 181.77999878, 727.30999756, 284.76998901]]), + "dimensions": np.array([[4.15, 1.57, 1.73]]), + "location": np.array([[1.0, 1.75, 13.22]]), + "rotation_y": np.array([1.62]), + "occluded": np.array([0]), + "truncated": np.array([0.0]), + } + + @pytest.fixture() + def det_3d_data_entity(self, original_kitti_format) -> Det3DDataEntity: + return Det3DDataEntity( + image=np.random.rand(725, 1920, 3), + img_info=ImageInfo( + img_idx=0, + img_shape=(380, 1280), + ori_shape=(725, 1920), + image_color_channel=True, + ignored_labels=[], + ), + boxes=np.zeros((50, 4), dtype=np.float32), + labels=np.zeros((50), dtype=np.int8), + calib_matrix=np.array( + [ + [721.5377, 0.0, 609.5593, 44.85728], + [0.0, 721.5377, 172.854, 0.2163791], + [0.0, 0.0, 1.0, 0.002745884], + ], + ), + boxes_3d=np.zeros((50, 6), dtype=np.float32), + size_2d=np.zeros((50, 2), dtype=np.float32), + size_3d=np.zeros((50, 3), 
dtype=np.float32), + depth=np.zeros((50, 1), dtype=np.float32), + heading_angle=np.zeros((50, 2), dtype=np.float32), + original_kitti_format=deepcopy(original_kitti_format), + ) + + def test_general_call( + self, + decode_transform: Decode3DInputsAffineTransforms, + det_3d_data_entity: Det3DDataEntity, + original_kitti_format: dict[str, np.array], + ) -> None: + """Test __call__.""" + results = decode_transform(det_3d_data_entity) + + assert results.image.shape == (3, 380, 1280) + assert results.labels.dtype == torch.long + for key in ["boxes", "boxes_3d", "size_2d", "size_3d", "depth", "heading_angle"]: + assert hasattr(results, key) + assert getattr(results, key).size()[0] == 1 # only one object + if key != "boxes": + assert isinstance(getattr(results, key), torch.Tensor) + assert getattr(results, key).dtype == torch.float32 + else: + assert isinstance(getattr(results, key), tv_tensors.BoundingBoxes) + + assert results.boxes.format == tv_tensors.BoundingBoxFormat.XYXY + assert results.boxes_3d.shape == (1, 6) + assert results.calib_matrix.shape == (3, 4) + # dimensions are in the right position and differ from original_kitti_format + assert original_kitti_format["dimensions"][0, 0] == results.size_3d[0, 2] + + def test_no_decode_annotations( + self, + decode_transform: Decode3DInputsAffineTransforms, + det_3d_data_entity: Det3DDataEntity, + mocker, + ) -> None: + """Test __call__.""" + decode_transform.decode_annotations = False + results = decode_transform(det_3d_data_entity) + + assert results.image.shape == (3, 380, 1280) + assert isinstance(results.image, torch.Tensor) + for key in ["boxes", "boxes_3d", "size_2d", "size_3d", "depth", "heading_angle"]: + assert hasattr(results, key) + assert getattr(results, key).size()[0] == 0 # all annotations filtered + if key != "boxes": + assert isinstance(getattr(results, key), torch.Tensor) + else: + assert isinstance(getattr(results, key), tv_tensors.BoundingBoxes) + assert results.calib_matrix.shape == (3, 4) + assert isinstance(results.calib_matrix, torch.Tensor) + + def test_no_input_size( + self, + decode_transform: Decode3DInputsAffineTransforms, + det_3d_data_entity: Det3DDataEntity, + mocker, + ) -> None: + # no resize and affine transforms + decode_transform.input_size = None + decode_transform._affine_transforms = mocker.MagicMock() + results = decode_transform(det_3d_data_entity) + assert results.image.shape == (3, 725, 1920) # no resize + assert isinstance(results.image, torch.Tensor) + assert decode_transform._affine_transforms.call_count == 0 + + def test_affine_transforms(self, decode_transform): + inputs = { + "image": np.random.rand(480, 640, 3), + "ori_shape": np.array([480, 640]), + } + transformed_inputs_0 = decode_transform._affine_transforms(inputs["image"], inputs["ori_shape"], (256, 256)) + + assert transformed_inputs_0[0].shape == (3, 256, 256) + assert transformed_inputs_0[0].dtype == torch.float32 + assert transformed_inputs_0[1] == 1 # no crop + assert transformed_inputs_0[2].shape == (2, 3) + assert isinstance(transformed_inputs_0[3], bool) + assert not transformed_inputs_0[3] + + # test crop + decode_transform.random_crop = True + decode_transform.p_crop = 1.0 + transformed_inputs_1 = decode_transform._affine_transforms(inputs["image"], inputs["ori_shape"], (256, 256)) + + assert transformed_inputs_1[0].shape == (3, 256, 256) + assert transformed_inputs_1[2].shape == (2, 3) + assert np.any(transformed_inputs_1[2] != transformed_inputs_0[2]) + assert transformed_inputs_1[1] != 1 + assert not transformed_inputs_1[3] 
+ + # test flip + decode_transform.random_crop = False + decode_transform.random_horizontal_flip = True + decode_transform.p_flip = 1.0 + transformed_inputs_2 = decode_transform._affine_transforms(inputs["image"], inputs["ori_shape"], (256, 256)) + + assert transformed_inputs_2[0].shape == (3, 256, 256) + assert transformed_inputs_2[2].shape == (2, 3) + assert np.all(transformed_inputs_2[2] == transformed_inputs_0[2]) + assert transformed_inputs_2[1] == 1 # no crop + assert transformed_inputs_2[3] # flip is True diff --git a/tests/unit/core/exporter/test_detection_3d.py b/tests/unit/core/exporter/test_detection_3d.py new file mode 100644 index 00000000000..98564615f61 --- /dev/null +++ b/tests/unit/core/exporter/test_detection_3d.py @@ -0,0 +1,70 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +"""Unit tests of 3D object detection exporter.""" + +from unittest.mock import MagicMock + +import pytest +import torch +from otx.core.exporter.detection_3d import OTXObjectDetection3DExporter +from otx.core.types.export import OTXExportFormatType + + +class TestOTXObjectDetection3DExporter: + @pytest.fixture() + def otx_detection_3d_exporter(self) -> OTXObjectDetection3DExporter: + return OTXObjectDetection3DExporter( + task_level_export_parameters=MagicMock(), + input_size=(10, 10), + ) + + def test_export_openvino(self, mocker, tmpdir, otx_detection_3d_exporter) -> None: + """Test export for OPENVINO.""" + mocker_openvino_convert_model = mocker.patch("openvino.convert_model") + mocker_postprocess_openvino_model = mocker.patch.object( + otx_detection_3d_exporter, + "_postprocess_openvino_model", + ) + mocker_openvino_save_model = mocker.patch("openvino.save_model") + mock_model = mocker.MagicMock() + mock_model.parameters.return_value = iter([torch.rand(1, 3)]) + + otx_detection_3d_exporter.export( + model=mock_model, + output_dir=tmpdir, + export_format=OTXExportFormatType.OPENVINO, + ) + + mocker_openvino_convert_model.assert_called() + mocker_postprocess_openvino_model.assert_called() + mocker_openvino_save_model.assert_called() + + with pytest.raises(NotImplementedError): + otx_detection_3d_exporter.export( + model=mock_model, + output_dir=tmpdir, + export_format=OTXExportFormatType.OPENVINO, + to_exportable_code=True, + ) + + def test_export_onnx(self, mocker, tmpdir, otx_detection_3d_exporter) -> None: + """Test export for ONNX.""" + mocker_torch_onnx_export = mocker.patch("torch.onnx.export") + mocker_onnx_load = mocker.patch("onnx.load") + mocker_onnx_save = mocker.patch("onnx.save") + mocker_postprocess_onnx_model = mocker.patch.object( + otx_detection_3d_exporter, + "_postprocess_onnx_model", + ) + mock_model = mocker.MagicMock() + + otx_detection_3d_exporter.export( + model=mock_model, + output_dir=tmpdir, + export_format=OTXExportFormatType.ONNX, + ) + + mocker_torch_onnx_export.assert_called() + mocker_onnx_load.assert_called() + mocker_onnx_save.assert_called() + mocker_postprocess_onnx_model.assert_called() diff --git a/tests/unit/core/metrics/test_accuracy.py b/tests/unit/core/metrics/test_accuracy.py index 8370fee09f6..d3c43a8a087 100644 --- a/tests/unit/core/metrics/test_accuracy.py +++ b/tests/unit/core/metrics/test_accuracy.py @@ -13,7 +13,7 @@ MultilabelAccuracywithLabelGroup, ) from otx.core.types.label import HLabelInfo, LabelInfo -from torchmetrics.classification.accuracy import BinaryAccuracy, MulticlassAccuracy +from torchmetrics.classification.accuracy import BinaryAccuracy, MulticlassAccuracy, MultilabelAccuracy class TestAccuracy: @@ 
-120,3 +120,28 @@ def test_multilabel_only(self) -> None: head_logits_info={"head1": (0, 5), "head2": (5, 10)}, threshold_multilabel=0.5, ) + + def test_multilabel_accuracy(self, hlabel_accuracy) -> None: + # Normal Case: num_multilabel_classes > 1 -> MultilabelAccuracy + assert hlabel_accuracy.num_multilabel_classes == 3 + assert isinstance(hlabel_accuracy.multilabel_accuracy, MultilabelAccuracy) + + # Edge Case: num_multilabel_classes = 1 -> BinaryAccuracy + acc = MixedHLabelAccuracy( + num_multiclass_heads=2, + num_multilabel_classes=1, + head_logits_info={"head1": (0, 5), "head2": (5, 10)}, + threshold_multilabel=0.5, + ) + assert acc.num_multilabel_classes == 1 + assert isinstance(acc.multilabel_accuracy, BinaryAccuracy) + + # None Case: num_multilabel_classes = 0 -> None + acc = MixedHLabelAccuracy( + num_multiclass_heads=2, + num_multilabel_classes=0, + head_logits_info={"head1": (0, 5), "head2": (5, 10)}, + threshold_multilabel=0.5, + ) + assert acc.num_multilabel_classes == 0 + assert acc.multilabel_accuracy is None diff --git a/tests/unit/core/model/test_detection_3d.py b/tests/unit/core/model/test_detection_3d.py new file mode 100644 index 00000000000..f46dc212b8d --- /dev/null +++ b/tests/unit/core/model/test_detection_3d.py @@ -0,0 +1,127 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Unit tests for 3D object detection model entity.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest +import torch +from otx.algo.object_detection_3d.monodetr3d import MonoDETR3D +from otx.core.data.entity.base import OTXBatchLossEntity +from otx.core.data.entity.object_detection_3d import Det3DBatchDataEntity, Det3DBatchPredEntity +from otx.core.metrics.average_precision_3d import KittiMetric +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.types.label import LabelInfo + +if TYPE_CHECKING: + from otx.core.model.detection_3d import OTX3DDetectionModel + + +class TestOTX3DDetectionModel: + @pytest.fixture() + def model(self, label_info, optimizer, scheduler, metric, torch_compile) -> OTX3DDetectionModel: + return MonoDETR3D(label_info, "monodetr_50", (1280, 384), optimizer, scheduler, metric, torch_compile) + + @pytest.fixture() + def batch_data_entity(self, model) -> Det3DBatchDataEntity: + return model.get_dummy_input(2) + + @pytest.fixture() + def label_info(self) -> LabelInfo: + return LabelInfo( + label_names=["label_0", "label_1"], + label_groups=[["label_0", "label_1"]], + ) + + @pytest.fixture() + def optimizer(self): + return DefaultOptimizerCallable + + @pytest.fixture() + def scheduler(self): + return DefaultSchedulerCallable + + @pytest.fixture() + def metric(self): + return KittiMetric + + @pytest.fixture() + def torch_compile(self): + return False + + def test_export_parameters(self, model): + params = model._export_parameters + assert params.model_type == "mono_3d_det" + assert params.task_type == "3d_detection" + + @pytest.mark.parametrize( + ("label_info", "expected_label_info"), + [ + ( + LabelInfo(label_names=["label1", "label2", "label3"], label_groups=[["label1", "label2", "label3"]]), + LabelInfo(label_names=["label1", "label2", "label3"], label_groups=[["label1", "label2", "label3"]]), + ), + (LabelInfo.from_num_classes(num_classes=5), LabelInfo.from_num_classes(num_classes=5)), + ], + ) + def test_dispatch_label_info(self, model, label_info, expected_label_info): + result = model._dispatch_label_info(label_info) + assert result == 
expected_label_info + + def test_init(self, model): + assert model.num_classes == 2 + + def test_customize_inputs(self, model, batch_data_entity): + customized_inputs = model._customize_inputs(batch_data_entity) + assert customized_inputs["images"].shape == (2, 3, model.input_size[0], model.input_size[1]) + assert "mode" in customized_inputs + assert "calibs" in customized_inputs + assert customized_inputs["calibs"].shape == (2, 3, 4) + + def test_customize_outputs_training(self, model, batch_data_entity): + outputs = {"loss": torch.tensor(0.5)} + customized_outputs = model._customize_outputs(outputs, batch_data_entity) + assert isinstance(customized_outputs, OTXBatchLossEntity) + assert customized_outputs["loss"] == torch.tensor(0.5) + + def test_customize_outputs_predict(self, model, batch_data_entity): + model.training = False + outputs = { + "scores": torch.randn(2, 50, 2), + "boxes_3d": torch.randn(2, 50, 6), + "boxes": torch.randn(2, 50, 4), + "size_3d": torch.randn(2, 50, 3), + "depth": torch.randn(2, 50, 2), + "heading_angle": torch.randn(2, 50, 24), + } + customized_outputs = model._customize_outputs(outputs, batch_data_entity) + assert isinstance(customized_outputs, Det3DBatchPredEntity) + assert hasattr(customized_outputs, "scores") + assert hasattr(customized_outputs, "heading_angle") + assert hasattr(customized_outputs, "boxes") + assert hasattr(customized_outputs, "size_2d") + assert len(customized_outputs.boxes_3d) == len(customized_outputs.scores) + + def test_dummy_input(self, model: OTX3DDetectionModel): + batch_size = 2 + batch = model.get_dummy_input(batch_size) + assert batch.batch_size == batch_size + + def test_convert_pred_entity_to_compute_metric(self, model: OTX3DDetectionModel, batch_data_entity): + model.training = False + outputs = { + "scores": torch.randn(2, 50, 2), + "boxes_3d": torch.randn(2, 50, 6), + "boxes": torch.randn(2, 50, 4), + "size_3d": torch.randn(2, 50, 3), + "depth": torch.randn(2, 50, 2), + "heading_angle": torch.randn(2, 50, 24), + } + customized_outputs = model._customize_outputs(outputs, batch_data_entity) + converted_pred = model._convert_pred_entity_to_compute_metric(customized_outputs, batch_data_entity) + + assert "preds" in converted_pred + assert "target" in converted_pred diff --git a/tests/unit/core/types/test_label.py b/tests/unit/core/types/test_label.py index 78daec6982e..3ae1ae1f463 100644 --- a/tests/unit/core/types/test_label.py +++ b/tests/unit/core/types/test_label.py @@ -1,7 +1,10 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations -from otx.core.types.label import NullLabelInfo, SegLabelInfo +from datumaro import LabelCategories +from datumaro.components.annotation import GroupType +from otx.core.types.label import HLabelInfo, NullLabelInfo, SegLabelInfo def test_as_json(fxt_label_info): @@ -18,3 +21,34 @@ def test_seg_label_info(): ) assert SegLabelInfo.from_num_classes(1) == SegLabelInfo(["background", "label_0"], [["background", "label_0"]]) assert SegLabelInfo.from_num_classes(0) == NullLabelInfo() + + +# Unit test +def test_hlabel_info(): + labels = [ + LabelCategories.Category(name="car", parent="vehicle"), + LabelCategories.Category(name="truck", parent="vehicle"), + LabelCategories.Category(name="plush toy", parent="plush toy"), + LabelCategories.Category(name="No class"), + ] + label_groups = [ + LabelCategories.LabelGroup( + name="Detection labels___vehicle", + labels=["car", "truck"], + group_type=GroupType.EXCLUSIVE, + ), + 
LabelCategories.LabelGroup( + name="Detection labels___plush toy", + labels=["plush toy"], + group_type=GroupType.EXCLUSIVE, + ), + LabelCategories.LabelGroup(name="No class", labels=["No class"], group_type=GroupType.RESTRICTED), + ] + dm_label_categories = LabelCategories(items=labels, label_groups=label_groups) + + hlabel_info = HLabelInfo.from_dm_label_groups(dm_label_categories) + + # Check if class_to_group_idx and label_to_idx have the same keys + assert list(hlabel_info.class_to_group_idx.keys()) == list( + hlabel_info.label_to_idx.keys(), + ), "class_to_group_idx and label_to_idx keys do not match" diff --git a/tests/unit/engine/adaptive_bs/test_bs_search_algo.py b/tests/unit/engine/adaptive_bs/test_bs_search_algo.py index fde7ceacda2..f59225e3b8a 100644 --- a/tests/unit/engine/adaptive_bs/test_bs_search_algo.py +++ b/tests/unit/engine/adaptive_bs/test_bs_search_algo.py @@ -99,12 +99,19 @@ def test_auto_decrease_batch_size(self): assert adapted_bs == 80 def test_find_max_usable_bs_gpu_memory_too_small(self): - mock_train_func = self.get_mock_train_func(cuda_oom_bound=4, max_runnable_bs=1) + mock_train_func = self.get_mock_train_func(cuda_oom_bound=1, max_runnable_bs=1) bs_search_algo = BsSearchAlgo(mock_train_func, 128, 1000) with pytest.raises(RuntimeError): bs_search_algo.auto_decrease_batch_size() + def test_auto_decrease_batch_size_bs2_not_oom_but_most_mem(self): + """Batch size 2 doesn't make oom but use most of memory.""" + mock_train_func = self.get_mock_train_func(cuda_oom_bound=2, max_runnable_bs=1) + + bs_search_algo = BsSearchAlgo(mock_train_func, 128, 1000) + assert bs_search_algo.auto_decrease_batch_size() == 2 + @pytest.mark.parametrize( ("max_runnable_bs", "max_bs", "expected_bs"), [ @@ -126,12 +133,19 @@ def test_find_big_enough_batch_size(self, max_runnable_bs, max_bs, expected_bs): assert adapted_bs == expected_bs def test_find_big_enough_batch_size_gpu_memory_too_small(self): - mock_train_func = self.get_mock_train_func(cuda_oom_bound=4, max_runnable_bs=1) + mock_train_func = self.get_mock_train_func(cuda_oom_bound=1, max_runnable_bs=1) bs_search_algo = BsSearchAlgo(mock_train_func, 128, 1000) with pytest.raises(RuntimeError): bs_search_algo.find_big_enough_batch_size() + def test_find_big_enough_batch_size_bs2_not_oom_but_most_mem(self): + """Batch size 2 doesn't make oom but use most of memory.""" + mock_train_func = self.get_mock_train_func(cuda_oom_bound=2, max_runnable_bs=1) + + bs_search_algo = BsSearchAlgo(mock_train_func, 2, 1000) + assert bs_search_algo.find_big_enough_batch_size() == 2 + def test_find_big_enough_batch_size_gradient_zero(self): def mock_train_func(batch_size) -> int: if batch_size > 1000: diff --git a/tests/unit/engine/hpo/test_hpo_api.py b/tests/unit/engine/hpo/test_hpo_api.py index bcc71d8bc9a..8b24dffcf00 100644 --- a/tests/unit/engine/hpo/test_hpo_api.py +++ b/tests/unit/engine/hpo/test_hpo_api.py @@ -119,7 +119,7 @@ def mock_find_trial_file(mocker) -> MagicMock: @pytest.fixture() def hpo_config() -> HpoConfig: - return HpoConfig(metric_name="val/accuracy") + return HpoConfig(metric_name="val/accuracy", callbacks_to_exclude="UselessCallback") @pytest.fixture() @@ -127,6 +127,19 @@ def mock_progress_update_callback() -> MagicMock: return MagicMock() +class UsefullCallback: + pass + + +class UselessCallback: + pass + + +@pytest.fixture() +def mock_callback() -> list: + return [UsefullCallback(), UselessCallback()] + + def test_execute_hpo( mock_engine: MagicMock, hpo_config: HpoConfig, @@ -138,12 +151,14 @@ def test_execute_hpo( 
mock_get_best_hpo_weight: MagicMock, mock_find_trial_file: MagicMock, mock_progress_update_callback: MagicMock, + mock_callback: list, ): + hpo_config.progress_update_callback = mock_progress_update_callback best_config, best_hpo_weight = execute_hpo( engine=mock_engine, max_epochs=10, hpo_config=hpo_config, - progress_update_callback=mock_progress_update_callback, + callbacks=mock_callback, ) # check hpo workdir exists @@ -152,12 +167,16 @@ def test_execute_hpo( # check a case where progress_update_callback exists mock_thread.assert_called_once() assert mock_thread.call_args.kwargs["target"] == _update_hpo_progress - assert mock_thread.call_args.kwargs["args"][0] == mock_progress_update_callback assert mock_thread.call_args.kwargs["daemon"] is True mock_thread.return_value.start.assert_called_once() # check whether run_hpo_loop is called well mock_run_hpo_loop.assert_called_once() assert mock_run_hpo_loop.call_args.args[0] == mock_hpo_algo + # check UselessCallback is excluded + for callback in mock_run_hpo_loop.call_args.args[1].keywords["callbacks"]: + assert not isinstance(callback, UselessCallback) + # check the original callback list isn't changed. + assert len(mock_callback) == 2 # print_result is called after HPO is done mock_hpo_algo.print_result.assert_called_once() # best_config and best_hpo_weight are returned well diff --git a/tests/unit/engine/test_engine.py b/tests/unit/engine/test_engine.py index 879987f19cc..3adcc5678d7 100644 --- a/tests/unit/engine/test_engine.py +++ b/tests/unit/engine/test_engine.py @@ -223,11 +223,7 @@ def test_exporting(self, fxt_engine, mocker) -> None: checkpoint = "path/to/checkpoint.ckpt" fxt_engine.checkpoint = checkpoint fxt_engine.export() - mock_load_from_checkpoint.assert_called_once_with( - checkpoint_path=checkpoint, - map_location="cpu", - **fxt_engine.model.hparams, - ) + mock_load_from_checkpoint.assert_called_once_with(checkpoint_path=checkpoint, map_location="cpu") mock_export.assert_called_once_with( output_dir=Path(fxt_engine.work_dir), base_name="exported_model",