From 80454808b38727e358e8b880043eeac0f18152fb Mon Sep 17 00:00:00 2001 From: Jaeguk Hyun Date: Thu, 31 Aug 2023 10:57:26 +0900 Subject: [PATCH] Add a new object detector, Lite-DINO (#2457) * Initial implementation of Lite DETR * Update model config for lite dino * Add norm to intermediate layer of ffn * Change FFN's norm order and add enc_scale attribute to encoder's layers * Merge with incremental recipe * Add model pretrained weight path * Update model info and add intg tests * Update docs * Update CHANGELOG * Change num iters --- CHANGELOG.md | 1 + .../object_detection/object_detection.rst | 5 + .../multi_scale_deformable_attn_pytorch.py | 17 +- .../mmdet/models/detectors/__init__.py | 2 + .../models/detectors/custom_lite_dino.py | 21 + .../adapters/mmdet/models/layers/__init__.py | 10 +- .../mmdet/models/layers/lite_detr_layers.py | 395 ++++++++++++++++++ .../detection/resnet50_lite_dino/__init__.py | 3 + .../resnet50_lite_dino/data_pipeline.py | 4 + .../resnet50_lite_dino/deployment.py | 12 + .../detection/resnet50_lite_dino/model.py | 120 ++++++ .../template_experimental.yaml | 64 +++ .../cli/detection/test_detection.py | 3 + 13 files changed, 648 insertions(+), 9 deletions(-) create mode 100644 src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_lite_dino.py create mode 100644 src/otx/algorithms/detection/adapters/mmdet/models/layers/lite_detr_layers.py create mode 100644 src/otx/algorithms/detection/configs/detection/resnet50_lite_dino/__init__.py create mode 100644 src/otx/algorithms/detection/configs/detection/resnet50_lite_dino/data_pipeline.py create mode 100644 src/otx/algorithms/detection/configs/detection/resnet50_lite_dino/deployment.py create mode 100644 src/otx/algorithms/detection/configs/detection/resnet50_lite_dino/model.py create mode 100644 src/otx/algorithms/detection/configs/detection/resnet50_lite_dino/template_experimental.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index cc41cc1c634..1b7c2d1e0d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ All notable changes to this project will be documented in this file. 
- Add ONNX metadata to detection, instance segmentation, and segmentation models ()
- Add a new feature to configure input size()
- Introduce the OTXSampler and AdaptiveRepeatDataHook to achieve faster training at the small data regime ()
+- Add a new object detector, Lite-DINO ()

### Enhancements

diff --git a/docs/source/guide/explanation/algorithms/object_detection/object_detection.rst b/docs/source/guide/explanation/algorithms/object_detection/object_detection.rst
index 9d7aec97058..e629d390d9e 100644
--- a/docs/source/guide/explanation/algorithms/object_detection/object_detection.rst
+++ b/docs/source/guide/explanation/algorithms/object_detection/object_detection.rst
@@ -100,6 +100,8 @@ In addition to these models, we supports experimental models for object detectio
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+-----------------+
| `Custom_Object_Detection_Gen3_DINO `_ | DINO | 235 | 182.0 |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+-----------------+
+| `Custom_Object_Detection_Gen3_Lite_DINO `_ | Lite-DINO | 140 | 190.0 |
++---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+-----------------+
| `Custom_Object_Detection_Gen3_ResNeXt101_ATSS `_ | ResNeXt101-ATSS | 434.75 | 344.0 |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+-----------------+
| `Object_Detection_YOLOX_S `_ | YOLOX_S | 33.51 | 46.0 |
@@ -110,6 +112,7 @@ In addition to these models, we supports experimental models for object detectio
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+---------------------+-----------------+

`Deformable_DETR `_ is a `DETR `_-based model, and it solves the slow convergence problem of DETR.
`DINO `_ improves Deformable-DETR-based methods via denoising anchor boxes. Current SOTA models for object detection are based on DINO.
+`Lite-DINO `_ is an efficient structure for DINO. It reduces the FLOPs of the transformer encoder, which accounts for the highest computational cost.

Although transformer-based models show notable performance on various object detection benchmarks, CNN-based models still show good performance with proper latency.
Therefore, we added a new experimental CNN-based method, ResNeXt101-ATSS. ATSS still shows good performance among `RetinaNet `_-based models.
We integrated the large ResNeXt101 backbone with our custom ATSS head, and it shows good transfer learning performance.
In addition, we added YOLOX variants to support users' diverse situations.

@@ -154,6 +157,8 @@ We trained each model with a single Nvidia GeForce RTX3090.
+----------------------------+------------------+-----------+-----------+-----------+-----------+--------------+
| ResNet50-DINO | 49.0 (66.4) | 47.2 | 99.5 | 62.9 | 93.5 | 99.1 |
+----------------------------+------------------+-----------+-----------+-----------+-----------+--------------+
+| ResNet50-Lite-DINO | 48.1 (64.4) | 47.0 | 99.0 | 62.5 | 93.6 | 99.4 |
++----------------------------+------------------+-----------+-----------+-----------+-----------+--------------+
| YOLOX_S | 40.3 (59.1) | 37.1 | 93.6 | 54.8 | 92.7 | 98.8 |
+----------------------------+------------------+-----------+-----------+-----------+-----------+--------------+
| YOLOX_L | 49.4 (67.1) | 44.5 | 94.6 | 55.8 | 91.8 | 99.0 |
diff --git a/src/otx/algorithms/common/adapters/mmcv/ops/multi_scale_deformable_attn_pytorch.py b/src/otx/algorithms/common/adapters/mmcv/ops/multi_scale_deformable_attn_pytorch.py
index a2f4d796731..025f16f3287 100644
--- a/src/otx/algorithms/common/adapters/mmcv/ops/multi_scale_deformable_attn_pytorch.py
+++ b/src/otx/algorithms/common/adapters/mmcv/ops/multi_scale_deformable_attn_pytorch.py
@@ -78,6 +78,7 @@ def _custom_grid_sample(im: torch.Tensor, grid: torch.Tensor, align_corners: boo
     Returns:
         torch.Tensor: A tensor with sampled points, shape (N, C, Hg, Wg)
     """
+    device = im.device
     n, c, h, w = im.shape
     gn, gh, gw, _ = grid.shape
     assert n == gn
@@ -113,14 +114,14 @@ def _custom_grid_sample(im: torch.Tensor, grid: torch.Tensor, align_corners: boo
     x0, x1, y0, y1 = x0 + 1, x1 + 1, y0 + 1, y1 + 1

     # Clip coordinates to padded image size
-    x0 = torch.where(x0 < 0, torch.tensor(0), x0)
-    x0 = torch.where(x0 > padded_w - 1, torch.tensor(padded_w - 1), x0)
-    x1 = torch.where(x1 < 0, torch.tensor(0), x1)
-    x1 = torch.where(x1 > padded_w - 1, torch.tensor(padded_w - 1), x1)
-    y0 = torch.where(y0 < 0, torch.tensor(0), y0)
-    y0 = torch.where(y0 > padded_h - 1, torch.tensor(padded_h - 1), y0)
-    y1 = torch.where(y1 < 0, torch.tensor(0), y1)
-    y1 = torch.where(y1 > padded_h - 1, torch.tensor(padded_h - 1), y1)
+    x0 = torch.where(x0 < 0, torch.tensor(0, device=device), x0)
+    x0 = torch.where(x0 > padded_w - 1, torch.tensor(padded_w - 1, device=device), x0)
+    x1 = torch.where(x1 < 0, torch.tensor(0, device=device), x1)
+    x1 = torch.where(x1 > padded_w - 1, torch.tensor(padded_w - 1, device=device), x1)
+    y0 = torch.where(y0 < 0, torch.tensor(0, device=device), y0)
+    y0 = torch.where(y0 > padded_h - 1, torch.tensor(padded_h - 1, device=device), y0)
+    y1 = torch.where(y1 < 0, torch.tensor(0, device=device), y1)
+    y1 = torch.where(y1 > padded_h - 1, torch.tensor(padded_h - 1, device=device), y1)

     im_padded = im_padded.view(n, c, -1)
diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/detectors/__init__.py b/src/otx/algorithms/detection/adapters/mmdet/models/detectors/__init__.py
index 0dc0e8e4079..6d1932436d4 100644
--- a/src/otx/algorithms/detection/adapters/mmdet/models/detectors/__init__.py
+++ b/src/otx/algorithms/detection/adapters/mmdet/models/detectors/__init__.py
@@ -6,6 +6,7 @@
 from .custom_atss_detector import CustomATSS
 from .custom_deformable_detr_detector import CustomDeformableDETR
 from .custom_dino_detector import CustomDINO
+from .custom_lite_dino import CustomLiteDINO
 from .custom_maskrcnn_detector import CustomMaskRCNN
 from .custom_maskrcnn_tile_optimized import CustomMaskRCNNTileOptimized
 from .custom_single_stage_detector import CustomSingleStageDetector
@@ -19,6 +20,7 @@ __all__ = [
     "CustomATSS",
     "CustomDeformableDETR",
     "CustomDINO",
+    "CustomLiteDINO",
     "CustomMaskRCNN",
     "CustomSingleStageDetector",
diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_lite_dino.py b/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_lite_dino.py
new file mode 100644
index 00000000000..b2f973187bb
--- /dev/null
+++ b/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_lite_dino.py
@@ -0,0 +1,21 @@
+"""OTX Lite-DINO Class for object detection."""

+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#

+from mmdet.models.builder import DETECTORS

+from otx.algorithms.common.utils.logger import get_logger
+from otx.algorithms.detection.adapters.mmdet.models.detectors import CustomDINO

+logger = get_logger()


+@DETECTORS.register_module()
+class CustomLiteDINO(CustomDINO):
+    """Custom Lite-DINO for object detection."""

+    def load_state_dict_pre_hook(self, model_classes, ckpt_classes, ckpt_dict, *args, **kwargs):
+        """Modify the official Lite-DINO weights before weight loading."""
+        # Skip CustomDINO's own hook and call its parent implementation directly.
+        super(CustomDINO, self).load_state_dict_pre_hook(model_classes, ckpt_classes, ckpt_dict, *args, **kwargs)
diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/layers/__init__.py b/src/otx/algorithms/detection/adapters/mmdet/models/layers/__init__.py
index 4ded67b4b79..6dc878e1ce0 100644
--- a/src/otx/algorithms/detection/adapters/mmdet/models/layers/__init__.py
+++ b/src/otx/algorithms/detection/adapters/mmdet/models/layers/__init__.py
@@ -5,5 +5,13 @@
 from .dino import CustomDINOTransformer
 from .dino_layers import CdnQueryGenerator, DINOTransformerDecoder
+from .lite_detr_layers import EfficientTransformerEncoder, EfficientTransformerLayer, SmallExpandFFN

-__all__ = ["CustomDINOTransformer", "DINOTransformerDecoder", "CdnQueryGenerator"]
+__all__ = [
+    "CustomDINOTransformer",
+    "DINOTransformerDecoder",
+    "CdnQueryGenerator",
+    "EfficientTransformerEncoder",
+    "EfficientTransformerLayer",
+    "SmallExpandFFN",
+]
diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/layers/lite_detr_layers.py b/src/otx/algorithms/detection/adapters/mmdet/models/layers/lite_detr_layers.py
new file mode 100644
index 00000000000..af7d57a7497
--- /dev/null
+++ b/src/otx/algorithms/detection/adapters/mmdet/models/layers/lite_detr_layers.py
@@ -0,0 +1,395 @@
+"""Layers for Lite-DETR."""

+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0

+import copy
+import warnings

+import torch
+from mmcv.cnn import Linear, build_norm_layer
+from mmcv.cnn.bricks.registry import FEEDFORWARD_NETWORK, TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE
+from mmcv.cnn.bricks.transformer import FFN, BaseTransformerLayer, build_transformer_layer
+from mmcv.runner.base_module import BaseModule, Sequential
+from torch import nn


+@FEEDFORWARD_NETWORK.register_module()
+class SmallExpandFFN(FFN):
+    """Implements feed-forward networks (FFNs) with a small-expand branch.

+    Low-level tokens are processed by a cheaper branch without channel
+    expansion, while high-level tokens use the regular FFN.

+    Args:
+        embed_dims (int): The feature dimension. Same as
+            `MultiheadAttention`. Default: 256.
+        feedforward_channels (int): The hidden dimension of FFNs.
+            Default: 1024.
+        num_fcs (int, optional): The number of fully-connected layers in
+            FFNs. Default: 2.
+        act_cfg (dict, optional): The activation config for FFNs.
+            Default: dict(type='ReLU').
+        ffn_drop (float, optional): Probability of an element to be
+            zeroed in FFN. Default: 0.0.
+        add_identity (bool, optional): Whether to add the
+            identity connection. Default: `True`.
+        dropout_layer (obj:`ConfigDict`): The dropout_layer used
+            when adding the shortcut.
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+    """

+    def __init__(
+        self,
+        embed_dims=256,
+        feedforward_channels=1024,
+        num_fcs=2,
+        act_cfg=dict(type="ReLU", inplace=True),
+        ffn_drop=0.0,
+        dropout_layer=None,
+        add_identity=True,
+        init_cfg=None,
+        **kwargs,
+    ):
+        super().__init__(
+            embed_dims,
+            feedforward_channels,
+            num_fcs,
+            act_cfg,
+            ffn_drop,
+            dropout_layer,
+            add_identity,
+            init_cfg,
+            **kwargs,
+        )

+        # Small-expand branch: same depth as the regular FFN but without
+        # channel expansion (embed_dims -> embed_dims).
+        layers = []
+        for _ in range(num_fcs - 1):
+            layers.append(Sequential(Linear(embed_dims, embed_dims), self.activate, nn.Dropout(ffn_drop)))
+        layers.append(Linear(embed_dims, embed_dims))
+        layers.append(nn.Dropout(ffn_drop))
+        self.small_expand_layers = Sequential(*layers)

+        self.norm1 = nn.LayerNorm(embed_dims)
+        self.norm2 = nn.LayerNorm(embed_dims)

+    def forward(self, x, level_start_index, enc_scale, identity=None):
+        """Forward function for FFN."""
+        # Tokens before the split point (low-level, high-resolution scales) go
+        # through the small-expand branch; the remaining high-level tokens go
+        # through the regular FFN.
+        x_3s = x[level_start_index[4 - enc_scale] :]
+        x_4s = x[: level_start_index[4 - enc_scale]]
+        x_4s = self.forward_ffn(self.small_expand_layers, self.norm2, x_4s, identity)
+        x_3s = self.forward_ffn(self.layers, self.norm1, x_3s, identity)
+        x = torch.cat([x_4s, x_3s], 0)

+        return x

+    def forward_ffn(self, layers, norm, x, identity=None):
+        """Forward the given FFN layers with an optional identity shortcut."""
+        out = layers(x)
+        if not self.add_identity:
+            return self.dropout_layer(out)
+        if identity is None:
+            identity = x
+        return norm(identity + self.dropout_layer(out))


+@TRANSFORMER_LAYER.register_module()
+class EfficientTransformerLayer(BaseTransformerLayer):
+    """Efficient TransformerLayer for Lite-DETR.

+    It is the base transformer encoder layer for `Lite-DETR `_.

+    Args:
+        attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None):
+            Configs for `self_attention` or `cross_attention` modules.
+            The order of the configs in the list should be consistent with
+            corresponding attentions in operation_order.
+            If it is a dict, all of the attention modules in operation_order
+            will be built with this config. Default: None.
+        ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None):
+            Configs for FFN. The order of the configs in the list should be
+            consistent with corresponding ffns in operation_order.
+            If it is a dict, all of the FFN modules in operation_order
+            will be built with this config.
+        operation_order (tuple[str]): The execution order of operations
+            in the transformer, such as ('self_attn', 'norm', 'ffn', 'norm').
+            `prenorm` is supported by specifying the first element as `norm`.
+            Default: None.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='LN').
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+        batch_first (bool): Whether Key, Query and Value have shape
+            (batch, n, embed_dim) rather than (n, batch, embed_dim).
+            Default: False.
+        enc_scale (int): The number of high-level feature scales. Default: 3.
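+        small_expand (bool): Whether the FFN is a `SmallExpandFFN` that
+            routes low- and high-level tokens through separate branches.
+            Default: False.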
+    """

+    def __init__(
+        self,
+        small_expand=False,
+        attn_cfgs=None,
+        ffn_cfgs=dict(
+            type="FFN",
+            embed_dims=256,
+            feedforward_channels=1024,
+            num_fcs=2,
+            ffn_drop=0.0,
+            act_cfg=dict(type="ReLU", inplace=True),
+        ),
+        operation_order=None,
+        norm_cfg=dict(type="LN"),
+        init_cfg=None,
+        batch_first=False,
+        enc_scale=3,
+        **kwargs,
+    ):
+        super().__init__(attn_cfgs, ffn_cfgs, operation_order, norm_cfg, init_cfg, batch_first, **kwargs)
+        self.enc_scale = enc_scale
+        self.small_expand = small_expand

+    def forward(
+        self,
+        query,
+        key=None,
+        value=None,
+        query_pos=None,
+        key_pos=None,
+        attn_masks=None,
+        query_key_padding_mask=None,
+        key_padding_mask=None,
+        level_start_index=None,
+        **kwargs,
+    ):
+        """Forward function for `EfficientTransformerLayer`.

+        **kwargs contains some specific arguments of attentions.

+        Args:
+            query (Tensor): The input query with shape
+                [num_queries, bs, embed_dims] if
+                self.batch_first is False, else
+                [bs, num_queries, embed_dims].
+            key (Tensor): The key tensor with shape [num_keys, bs,
+                embed_dims] if self.batch_first is False, else
+                [bs, num_keys, embed_dims].
+            value (Tensor): The value tensor with same shape as `key`.
+            query_pos (Tensor): The positional encoding for `query`.
+                Default: None.
+            key_pos (Tensor): The positional encoding for `key`.
+                Default: None.
+            attn_masks (List[Tensor] | None): 2D Tensors used in
+                the calculation of corresponding attention. The length
+                should equal the number of `attention` entries in
+                `operation_order`. Default: None.
+            query_key_padding_mask (Tensor): ByteTensor for `query`, with
+                shape [bs, num_queries]. Only used in `self_attn` layer.
+                Default: None.
+            key_padding_mask (Tensor): ByteTensor for `key`, with
+                shape [bs, num_keys]. Default: None.
+            level_start_index (Tensor): Start index for each level.
+            kwargs: Additional arguments.

+        Returns:
+            Tensor: forwarded results with shape [num_queries, bs, embed_dims].
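+
+        Note:
+            When `self.small_expand` is True, the FFN is additionally given
+            `level_start_index` and `self.enc_scale` so that it can split the
+            token sequence into low- and high-level parts.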
+ """ + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [copy.deepcopy(attn_masks) for _ in range(self.num_attn)] + warnings.warn(f"Use same attn_mask in all attentions in " f"{self.__class__.__name__} ") + else: + assert len(attn_masks) == self.num_attn, ( + f"The length of " + f"attn_masks {len(attn_masks)} must be equal " + f"to the number of attention in " + f"operation_order {self.num_attn}" + ) + + for layer in self.operation_order: + if layer == "self_attn": + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + level_start_index=level_start_index, + **kwargs, + ) + attn_index += 1 + identity = query + + elif layer == "norm": + query = self.norms[norm_index](query) + norm_index += 1 + + elif layer == "cross_attn": + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + level_start_index=level_start_index, + **kwargs, + ) + attn_index += 1 + identity = query + + elif layer == "ffn": + if self.small_expand: + query = self.ffns[ffn_index]( + query, level_start_index, self.enc_scale, identity if self.pre_norm else None + ) + else: + query = self.ffns[ffn_index](query, identity if self.pre_norm else None) + ffn_index += 1 + + return query + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class EfficientTransformerEncoder(BaseModule): + """TransformerEncoder of Lite-DETR. + + Args: + post_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. 
+            Only used when `self.pre_norm` is `True`.
+        transformerlayers (list[obj:`mmcv.ConfigDict`]): Configs of the
+            encoder layers. A two-element list [regular_layer, expand_layer]
+            is expanded to `num_layers` layers.
+        num_layers (int): The number of encoder layers.
+        enc_scale (int): The number of high-level feature scales. Default: 3.
+        num_expansion (int): The number of layers that update the full
+            multi-scale token set. Default: 3.
+    """

+    def __init__(
+        self,
+        transformerlayers=None,
+        num_layers=None,
+        init_cfg=None,
+        post_norm_cfg=dict(type="LN"),
+        enc_scale=3,
+        num_expansion=3,
+        **kwargs,
+    ):
+        super().__init__(init_cfg)
+        if len(transformerlayers) == 2 and num_layers != 2:
+            if num_expansion == 1:
+                _transformerlayers = [copy.deepcopy(transformerlayers[0]) for _ in range(num_layers - 1)] + [
+                    transformerlayers[1]
+                ]
+            else:
+                # Repeat [regular, ..., regular, expand] blocks num_expansion times.
+                _transformerlayers = []
+                for _ in range(num_expansion):
+                    for _ in range(int(num_layers / num_expansion) - 1):
+                        _transformerlayers.append(copy.deepcopy(transformerlayers[0]))
+                    _transformerlayers.append(copy.deepcopy(transformerlayers[1]))
+        else:
+            assert isinstance(transformerlayers, list) and len(transformerlayers) == num_layers
+            _transformerlayers = transformerlayers
+        self.num_layers = num_layers
+        self.layers = nn.ModuleList()
+        for layer in _transformerlayers:
+            layer = build_transformer_layer(layer)
+            assert layer.enc_scale == enc_scale
+            self.layers.append(layer)
+        self.embed_dims = self.layers[0].embed_dims
+        self.pre_norm = self.layers[0].pre_norm
+        self.num_expansion = num_expansion
+        self.enc_scale = enc_scale

+        if post_norm_cfg is not None:
+            self.post_norm = build_norm_layer(post_norm_cfg, self.embed_dims)[1] if self.pre_norm else None
+        else:
+            assert not self.pre_norm, f"Use prenorm in {self.__class__.__name__}, please specify post_norm_cfg"
+            self.post_norm = None

+    def forward(
+        self,
+        query,
+        key,
+        value,
+        query_pos=None,
+        key_pos=None,
+        attn_masks=None,
+        query_key_padding_mask=None,
+        key_padding_mask=None,
+        level_start_index=None,
+        reference_points=None,
+        **kwargs,
+    ):
+        """Forward function for `EfficientTransformerEncoder`.

+        Args:
+            query (Tensor): Input query with shape
+                `(num_queries, bs, embed_dims)`.
+            key (Tensor): The key tensor with shape
+                `(num_keys, bs, embed_dims)`.
+            value (Tensor): The value tensor with shape
+                `(num_keys, bs, embed_dims)`.
+            query_pos (Tensor): The positional encoding for `query`.
+                Default: None.
+            key_pos (Tensor): The positional encoding for `key`.
+                Default: None.
+            attn_masks (List[Tensor], optional): Each element is a 2D Tensor
+                used in the calculation of the corresponding attention in
+                operation_order. Default: None.
+            query_key_padding_mask (Tensor): ByteTensor for `query`, with
+                shape [bs, num_queries]. Only used in self-attention.
+                Default: None.
+            key_padding_mask (Tensor): ByteTensor for `key`, with
+                shape [bs, num_keys]. Default: None.
+            level_start_index (Tensor): Start index for each level.
+            reference_points (Tensor): Reference points for bbox predictions.
+            kwargs: Additional arguments.

+        Returns:
+            Tensor: results with shape [num_queries, bs, embed_dims].
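+
+        Note:
+            Only the high-level tokens (the last `enc_scale` scales) are used
+            as queries in most layers; every `num_layers / num_expansion`-th
+            layer processes the full multi-scale token set so that low-level
+            tokens are updated as well.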
+        """
+        # Keep the full multi-scale token set in `value`; only the high-level
+        # tokens (from `level_start_index[4 - enc_scale]` onward) serve as
+        # queries in most layers.
+        value = query
+        value_tgt = value[level_start_index[4 - self.enc_scale] :]
+        query = value_tgt
+        reference_points_tgt = reference_points[:, level_start_index[4 - self.enc_scale] :]
+        query_pos_tgt = query_pos[level_start_index[4 - self.enc_scale] :]
+        for layer_id, layer in enumerate(self.layers):
+            if (layer_id + 1) % (self.num_layers / self.num_expansion) == 0:
+                # Expansion layer: update the full multi-scale token set,
+                # including the low-level tokens.
+                query = value
+                output = layer(
+                    query,
+                    key,
+                    value,
+                    query_pos=query_pos,
+                    reference_points=reference_points,
+                    level_start_index=level_start_index,
+                    key_pos=key_pos,
+                    attn_masks=attn_masks,
+                    query_key_padding_mask=query_key_padding_mask,
+                    key_padding_mask=key_padding_mask,
+                    **kwargs,
+                )
+                query = output[level_start_index[4 - self.enc_scale] :]
+                value = output
+            else:
+                # Regular layer: update only the high-level tokens and write
+                # them back into the full token set.
+                output = layer(
+                    query,
+                    key,
+                    value,
+                    query_pos=query_pos_tgt,
+                    reference_points=reference_points_tgt,
+                    level_start_index=level_start_index,
+                    key_pos=key_pos,
+                    attn_masks=attn_masks,
+                    query_key_padding_mask=query_key_padding_mask,
+                    key_padding_mask=key_padding_mask,
+                    **kwargs,
+                )
+                query = output
+                value = torch.cat([value[: level_start_index[4 - self.enc_scale]], query], 0)
+        if self.post_norm is not None:
+            output = self.post_norm(output)
+        return value
diff --git a/src/otx/algorithms/detection/configs/detection/resnet50_lite_dino/__init__.py b/src/otx/algorithms/detection/configs/detection/resnet50_lite_dino/__init__.py
new file mode 100644
index 00000000000..6ed610c151c
--- /dev/null
+++ b/src/otx/algorithms/detection/configs/detection/resnet50_lite_dino/__init__.py
@@ -0,0 +1,3 @@
+"""Initialization of Lite DINO for OTX Detection."""
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
diff --git a/src/otx/algorithms/detection/configs/detection/resnet50_lite_dino/data_pipeline.py b/src/otx/algorithms/detection/configs/detection/resnet50_lite_dino/data_pipeline.py
new file mode 100644
index 00000000000..d353a35bbaf
--- /dev/null
+++ b/src/otx/algorithms/detection/configs/detection/resnet50_lite_dino/data_pipeline.py
@@ -0,0 +1,4 @@
+"""Data pipeline for Lite DINO."""


+_base_ = ["../../base/data/detr_data_pipeline.py"]
diff --git a/src/otx/algorithms/detection/configs/detection/resnet50_lite_dino/deployment.py b/src/otx/algorithms/detection/configs/detection/resnet50_lite_dino/deployment.py
new file mode 100644
index 00000000000..f9f8653afc4
--- /dev/null
+++ b/src/otx/algorithms/detection/configs/detection/resnet50_lite_dino/deployment.py
@@ -0,0 +1,12 @@
+"""MMDeploy config of Lite DINO model for Detection Task."""

+_base_ = ["../../base/deployments/base_detection_dynamic.py"]

+ir_config = dict(
+    output_names=["boxes", "labels"],
+    opset_version=16,
+)

+backend_config = dict(
+    model_inputs=[dict(opt_shapes=dict(input=[-1, 3, 800, 1333]))],
+)
diff --git a/src/otx/algorithms/detection/configs/detection/resnet50_lite_dino/model.py b/src/otx/algorithms/detection/configs/detection/resnet50_lite_dino/model.py
new file mode 100644
index 00000000000..07789654825
--- /dev/null
+++ b/src/otx/algorithms/detection/configs/detection/resnet50_lite_dino/model.py
@@ -0,0 +1,120 @@
+"""Model config for Lite DINO."""
+_base_ = [
+    "../../../../../recipes/stages/detection/incremental.py",
+]
+model = dict(
+    type="CustomLiteDINO",
+    backbone=dict(
+        type="ResNet",
+        depth=50,
+        num_stages=4,
+        out_indices=(1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type="BN", requires_grad=False),
+        norm_eval=True,
+        style="pytorch",
+        init_cfg=dict(type="Pretrained", checkpoint="torchvision://resnet50"),
+    ),
+
neck=dict( + type="ChannelMapper", + in_channels=[512, 1024, 2048], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=dict(type="GN", num_groups=32), + num_outs=4, + ), + bbox_head=dict( + type="CustomDINOHead", + num_query=900, + num_classes=80, + in_channels=2048, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=True, + transformer=dict( + type="CustomDINOTransformer", + encoder=dict( + type="EfficientTransformerEncoder", + num_expansion=3, + enc_scale=1, + num_layers=6, + transformerlayers=[ + dict( + type="EfficientTransformerLayer", + enc_scale=1, + attn_cfgs=dict(type="MultiScaleDeformableAttention", embed_dims=256, dropout=0.0), + feedforward_channels=2048, + ffn_dropout=0.0, + operation_order=("self_attn", "norm", "ffn", "norm"), + ), + dict( + type="EfficientTransformerLayer", + enc_scale=1, + small_expand=True, + attn_cfgs=dict(type="MultiScaleDeformableAttention", embed_dims=256, dropout=0.0), + ffn_cfgs=dict( + type="SmallExpandFFN", + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + feedforward_channels=2048, + ffn_dropout=0.0, + operation_order=("self_attn", "norm", "ffn"), + ), + ], + ), + decoder=dict( + type="DINOTransformerDecoder", + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type="DetrTransformerDecoderLayer", + attn_cfgs=[ + dict(type="MultiheadAttention", embed_dims=256, num_heads=8, dropout=0.0), + dict(type="MultiScaleDeformableAttention", embed_dims=256, dropout=0.0), + ], + feedforward_channels=2048, + ffn_dropout=0.0, + operation_order=("self_attn", "norm", "cross_attn", "norm", "ffn", "norm"), + ), + ), + ), + positional_encoding=dict( + type="SinePositionalEncoding", num_feats=128, normalize=True, offset=0.0, temperature=20 + ), + loss_cls=dict(type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), + loss_bbox=dict(type="L1Loss", loss_weight=5.0), + loss_iou=dict(type="GIoULoss", loss_weight=2.0), + dn_cfg=dict( + label_noise_scale=0.5, + box_noise_scale=1.0, # 0.4 for DN-DETR + group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100), + ), + ), + # training and testing settings + train_cfg=dict( + assigner=dict( + type="HungarianAssigner", + cls_cost=dict(type="FocalLossCost", weight=1.0), + reg_cost=dict(type="BBoxL1Cost", weight=5.0, box_format="xywh"), + iou_cost=dict(type="IoUCost", iou_mode="giou", weight=2.0), + ) + ), + test_cfg=dict(max_per_img=300), +) +# optimizer +optimizer = dict( + _delete_=True, + type="AdamW", + lr=1e-4, + weight_decay=0.0001, +) +optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) +load_from = "https://storage.openvinotoolkit.org/repositories/openvino_training_extensions/\ +models/object_detection/v2/lite-dino-coco.pth" +resume_from = None +ignore = False diff --git a/src/otx/algorithms/detection/configs/detection/resnet50_lite_dino/template_experimental.yaml b/src/otx/algorithms/detection/configs/detection/resnet50_lite_dino/template_experimental.yaml new file mode 100644 index 00000000000..34692cd9a36 --- /dev/null +++ b/src/otx/algorithms/detection/configs/detection/resnet50_lite_dino/template_experimental.yaml @@ -0,0 +1,64 @@ +# Description. +model_template_id: Custom_Object_Detection_Gen3_Lite_DINO +name: Lite-DINO +task_type: DETECTION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Object Detection for Lite DINO +application: ~ + +# Algo backend. +framework: OTXDetection v2.9.1 + +# Task implementations. 
+entrypoints: + base: otx.algorithms.detection.adapters.mmdet.task.MMDetectionTask + openvino: otx.algorithms.detection.adapters.openvino.task.OpenVINODetectionTask + nncf: otx.algorithms.detection.adapters.mmdet.nncf.task.DetectionNNCFTask + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 4 + auto_hpo_state: POSSIBLE + learning_rate: + default_value: 0.0001 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 0 + num_iters: + default_value: 200 + nncf_optimization: + enable_quantization: + default_value: true + enable_pruning: + default_value: false + pruning_supported: + default_value: true + maximal_accuracy_degradation: + default_value: 1.0 + algo_backend: + train_type: + default_value: Incremental + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 140 +size: 192.0 +# # Inference options. Defined by OpenVINO capabilities, not Algo Backend or Platform. +# inference_targets: +# - CPU +# - GPU +# - VPU diff --git a/tests/integration/cli/detection/test_detection.py b/tests/integration/cli/detection/test_detection.py index 287a1c936bd..ffbbb6f62f9 100644 --- a/tests/integration/cli/detection/test_detection.py +++ b/tests/integration/cli/detection/test_detection.py @@ -74,6 +74,9 @@ "src/otx/algorithms/detection/configs/detection/resnet50_deformable_detr/template_experimental.yaml" ), parse_model_template("src/otx/algorithms/detection/configs/detection/resnet50_dino/template_experimental.yaml"), + parse_model_template( + "src/otx/algorithms/detection/configs/detection/resnet50_lite_dino/template_experimental.yaml" + ), parse_model_template("src/otx/algorithms/detection/configs/detection/resnext101_atss/template_experimental.yaml"), parse_model_template( "src/otx/algorithms/detection/configs/detection/cspdarknet_yolox_s/template_experimental.yaml"