From 85f033a50fcc61c7aeb62704cdecbac4acfcc4e8 Mon Sep 17 00:00:00 2001
From: "Kang, Harim" <harim.kang@intel.com>
Date: Mon, 20 Mar 2023 13:40:59 +0900
Subject: [PATCH 1/5] Add mmcls transformer backbones

---
 otx/algorithms/classification/configs/configuration.yaml | 2 +-
 otx/algorithms/common/configs/training_base.py           | 2 +-
 otx/cli/builder/supported_backbone/mmcls.json            | 8 ++++----
 otx/mpa/cls/stage.py                                     | 7 +++++++
 4 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/otx/algorithms/classification/configs/configuration.yaml b/otx/algorithms/classification/configs/configuration.yaml
index 897c3f7e13f..dd2a93c51a0 100644
--- a/otx/algorithms/classification/configs/configuration.yaml
+++ b/otx/algorithms/classification/configs/configuration.yaml
@@ -10,7 +10,7 @@ learning_parameters:
       stable. A larger batch size has higher memory requirements.
     editable: true
     header: Batch size
-    max_value: 512
+    max_value: 2048
     min_value: 1
     type: INTEGER
     ui_rules:
diff --git a/otx/algorithms/common/configs/training_base.py b/otx/algorithms/common/configs/training_base.py
index 1e99f5048ee..4c397554c54 100644
--- a/otx/algorithms/common/configs/training_base.py
+++ b/otx/algorithms/common/configs/training_base.py
@@ -65,7 +65,7 @@ class BaseLearningParameters(ParameterGroup):
         batch_size = configurable_integer(
             default_value=5,
             min_value=1,
-            max_value=512,
+            max_value=2048,
             header="Batch size",
             description="The number of training samples seen in each iteration of training. Increasing thisvalue "
             "improves training time and may make the training more stable. A larger batch size has higher "
diff --git a/otx/cli/builder/supported_backbone/mmcls.json b/otx/cli/builder/supported_backbone/mmcls.json
index 6b5f1343a2e..a63bfc636b3 100644
--- a/otx/cli/builder/supported_backbone/mmcls.json
+++ b/otx/cli/builder/supported_backbone/mmcls.json
@@ -11,7 +11,7 @@
       "options": {
         "arch": ["tiny", "small", "base"]
       },
-      "available": []
+      "available": ["CLASSIFICATION"]
     },
     "mmcls.ConvMixer": {
       "required": ["arch"],
@@ -287,7 +287,7 @@
     "mmcls.T2T_ViT": {
       "required": [],
       "options": {},
-      "available": []
+      "available": ["CLASSIFICATION"]
     },
     "mmcls.TIMMBackbone": {
       "required": ["model_name"],
@@ -299,7 +299,7 @@
       "options": {
         "arch": ["base", "small"]
       },
-      "available": []
+      "available": ["CLASSIFICATION"]
     },
     "mmcls.PCPVT": {
       "required": ["arch"],
@@ -341,7 +341,7 @@
           "deit-base"
         ]
       },
-      "available": []
+      "available": ["CLASSIFICATION"]
     }
   }
 }
diff --git a/otx/mpa/cls/stage.py b/otx/mpa/cls/stage.py
index d24abbe12fd..08d5d051eb8 100644
--- a/otx/mpa/cls/stage.py
+++ b/otx/mpa/cls/stage.py
@@ -15,6 +15,8 @@
 
 logger = get_logger()
 
+TRANSFORMER_BACKBONES = ["VisionTransformer", "T2T_ViT", "TNT", "Conformer"]
+
 
 class ClsStage(Stage):
     MODEL_BUILDER = build_classifier
@@ -89,6 +91,11 @@ def configure_in_channel(cfg):
         output = layer(torch.rand([1] + list(input_shape)))
         if isinstance(output, (tuple, list)):
             output = output[-1]
+
+        if layer.__class__.__name__ in TRANSFORMER_BACKBONES:
+            # mmcls.VisionTransformer outputs Tuple[List[...]] and the last index of List is the final logit.
+            _, output = output
+
         in_channels = output.shape[1]
         if cfg.model.get("neck") is not None:
             if cfg.model.neck.get("in_channels") is not None:

From 3b0c1488af95f2b9d05264bee4a3f4dbedb1c1d8 Mon Sep 17 00:00:00 2001
From: "Kang, Harim" <harim.kang@intel.com>
Date: Mon, 20 Mar 2023 14:21:36 +0900
Subject: [PATCH 2/5] Fix VisionTransformer output check

---
 otx/mpa/cls/stage.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/otx/mpa/cls/stage.py b/otx/mpa/cls/stage.py
index 08d5d051eb8..eeefaa14db0 100644
--- a/otx/mpa/cls/stage.py
+++ b/otx/mpa/cls/stage.py
@@ -92,7 +92,7 @@ def configure_in_channel(cfg):
         if isinstance(output, (tuple, list)):
             output = output[-1]
 
-        if layer.__class__.__name__ in TRANSFORMER_BACKBONES:
+        if layer.__class__.__name__ in TRANSFORMER_BACKBONES and isinstance(output, (tuple, list)):
             # mmcls.VisionTransformer outputs Tuple[List[...]] and the last index of List is the final logit.
             _, output = output
 

From 86baffcde4f6859a8d47a0a4888bf216162dbcf8 Mon Sep 17 00:00:00 2001
From: "Kang, Harim" <harim.kang@intel.com>
Date: Mon, 20 Mar 2023 22:03:13 +0900
Subject: [PATCH 3/5] Share TRANSFORMER_BACKBONES and enforce VisionTransformerClsHead

---
 otx/algorithms/__init__.py                       | 2 ++
 otx/cli/builder/builder.py                       | 7 +++++++
 otx/mpa/cls/inferrer.py                          | 4 ++++
 otx/mpa/cls/stage.py                             | 5 +++--
 otx/mpa/modules/hooks/recording_forward_hooks.py | 9 ++++++---
 5 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/otx/algorithms/__init__.py b/otx/algorithms/__init__.py
index 3d087f538e4..5e6c32a5e1d 100644
--- a/otx/algorithms/__init__.py
+++ b/otx/algorithms/__init__.py
@@ -2,3 +2,5 @@
 
 # Copyright (C) 2022 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
+
+TRANSFORMER_BACKBONES = ["VisionTransformer", "T2T_ViT", "TNT", "Conformer"]
diff --git a/otx/cli/builder/builder.py b/otx/cli/builder/builder.py
index 5adfe235a96..1c3aee71329 100644
--- a/otx/cli/builder/builder.py
+++ b/otx/cli/builder/builder.py
@@ -29,6 +29,7 @@
 from torch import nn
 
 from otx.api.entities.model_template import TaskType
+from otx.algorithms import TRANSFORMER_BACKBONES
 from otx.cli.utils.importing import (
     get_backbone_list,
     get_backbone_registry,
@@ -212,6 +213,12 @@ def merge_backbone(
             out_channels = -1
             if hasattr(model_config.model, "head"):
                 model_config.model.head.in_channels = -1
+            # TODO: This is a hard coded part of the Transformer backbone and needs to be refactored.
+            if backend == "mmcls" and backbone_class in TRANSFORMER_BACKBONES:
+                if hasattr(model_config.model, "neck"):
+                    model_config.model.neck = None
+                if hasattr(model_config.model, "head"):
+                    model_config.model.head["type"] = "VisionTransformerClsHead"
         else:
             # Need to update in/out channel configuration here
             out_channels = get_backbone_out_channels(backbone)
diff --git a/otx/mpa/cls/inferrer.py b/otx/mpa/cls/inferrer.py
index 17336bf7cd4..3c511d91212 100644
--- a/otx/mpa/cls/inferrer.py
+++ b/otx/mpa/cls/inferrer.py
@@ -11,6 +11,7 @@
 from mmcls.datasets import build_dataset as mmcls_build_dataset
 from mmcv import Config, ConfigDict
 
+from otx.algorithms import TRANSFORMER_BACKBONES
 from otx.algorithms.common.adapters.mmcv.utils import (
     build_data_parallel,
     build_dataloader,
@@ -53,6 +54,9 @@ def run(self, model_cfg, model_ckpt, data_cfg, **kwargs):
         model_builder = kwargs.get("model_builder", None)
         dump_features = kwargs.get("dump_features", False)
         dump_saliency_map = kwargs.get("dump_saliency_map", False)
+        # TODO: It looks like we need to modify that code in an appropriate way.
+        if model_cfg.model.head.get("type", None) == "VisionTransformerClsHead":
+            dump_saliency_map = False
         eval = kwargs.get("eval", False)
         outputs = self.infer(
             cfg,
diff --git a/otx/mpa/cls/stage.py b/otx/mpa/cls/stage.py
index eeefaa14db0..d12eb790e05 100644
--- a/otx/mpa/cls/stage.py
+++ b/otx/mpa/cls/stage.py
@@ -9,14 +9,13 @@
 from mmcv import ConfigDict, build_from_cfg
 
 from otx.algorithms.classification.adapters.mmcls.utils.builder import build_classifier
+from otx.algorithms import TRANSFORMER_BACKBONES
 from otx.mpa.stage import Stage
 from otx.mpa.utils.config_utils import recursively_update_cfg, update_or_add_custom_hook
 from otx.mpa.utils.logger import get_logger
 
 logger = get_logger()
 
-TRANSFORMER_BACKBONES = ["VisionTransformer", "T2T_ViT", "TNT", "Conformer"]
-
 
 class ClsStage(Stage):
     MODEL_BUILDER = build_classifier
@@ -95,6 +94,8 @@ def configure_in_channel(cfg):
         if layer.__class__.__name__ in TRANSFORMER_BACKBONES and isinstance(output, (tuple, list)):
             # mmcls.VisionTransformer outputs Tuple[List[...]] and the last index of List is the final logit.
             _, output = output
+            if cfg.model.head.type != "VisionTransformerClsHead":
+                raise ValueError(f"{layer.__class__.__name__ } needs VisionTransformerClsHead as head")
 
         in_channels = output.shape[1]
         if cfg.model.get("neck") is not None:
diff --git a/otx/mpa/modules/hooks/recording_forward_hooks.py b/otx/mpa/modules/hooks/recording_forward_hooks.py
index 4b3fc7011e2..0935ce8c9e6 100644
--- a/otx/mpa/modules/hooks/recording_forward_hooks.py
+++ b/otx/mpa/modules/hooks/recording_forward_hooks.py
@@ -20,6 +20,7 @@
 import torch
 
 from otx import MMCLS_AVAILABLE
+from otx.algorithms import TRANSFORMER_BACKBONES
 
 if MMCLS_AVAILABLE:
     from mmcls.models.necks.gap import GlobalAveragePooling
@@ -116,10 +117,12 @@ def func(feature_map: Union[torch.Tensor, Sequence[torch.Tensor]], fpn_idx: int
 
 
 class FeatureVectorHook(BaseRecordingForwardHook):
-    @staticmethod
-    def func(feature_map: Union[torch.Tensor, Sequence[torch.Tensor]]) -> torch.Tensor:
+    def func(self, feature_map: Union[torch.Tensor, Sequence[torch.Tensor]]) -> torch.Tensor:
         """Generate the feature vector by average pooling feature maps."""
-        if isinstance(feature_map, (list, tuple)):
+        if self._module.backbone.__class__.__name__ in TRANSFORMER_BACKBONES and isinstance(feature_map[-1], (tuple, list)):
+            # mmcls.VisionTransformer outputs Tuple[List[...]] and the last index of List is the final logit.
+            feature_vector, _ = feature_map[-1]
+        elif isinstance(feature_map, (list, tuple)):
             # aggregate feature maps from Feature Pyramid Network
             feature_vector = [torch.nn.functional.adaptive_avg_pool2d(f, (1, 1)) for f in feature_map]
             feature_vector = torch.cat(feature_vector, 1)

From d300f7ca2f94b3a42452c1f310a85f92b35febba Mon Sep 17 00:00:00 2001
From: "Kang, Harim" <harim.kang@intel.com>
Date: Mon, 20 Mar 2023 22:22:27 +0900
Subject: [PATCH 4/5] Disable recording forward hooks in inferrer

---
 otx/algorithms/__init__.py                       | 2 +-
 otx/cli/builder/builder.py                       | 6 +++---
 otx/cli/builder/supported_backbone/mmcls.json    | 2 +-
 otx/mpa/cls/inferrer.py                          | 1 +
 otx/mpa/cls/stage.py                             | 2 +-
 otx/mpa/modules/hooks/recording_forward_hooks.py | 8 +++-----
 6 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/otx/algorithms/__init__.py b/otx/algorithms/__init__.py
index 5e6c32a5e1d..daf814e52b2 100644
--- a/otx/algorithms/__init__.py
+++ b/otx/algorithms/__init__.py
@@ -3,4 +3,4 @@
 # Copyright (C) 2022 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-TRANSFORMER_BACKBONES = ["VisionTransformer", "T2T_ViT", "TNT", "Conformer"]
+TRANSFORMER_BACKBONES = ["VisionTransformer", "T2T_ViT", "Conformer"]
diff --git a/otx/cli/builder/builder.py b/otx/cli/builder/builder.py
index 1c3aee71329..aabff5429ab 100644
--- a/otx/cli/builder/builder.py
+++ b/otx/cli/builder/builder.py
@@ -28,8 +28,8 @@
 from mmcv.utils import Registry, build_from_cfg
 from torch import nn
 
-from otx.api.entities.model_template import TaskType
 from otx.algorithms import TRANSFORMER_BACKBONES
+from otx.api.entities.model_template import TaskType
 from otx.cli.utils.importing import (
     get_backbone_list,
     get_backbone_registry,
@@ -102,8 +102,8 @@ def update_backbone_args(backbone_config: dict, registry: Registry, backend: str
 
 def update_channels(model_config: MPAConfig, out_channels: Any):
     """Update in_channel of head or neck."""
-    if hasattr(model_config.model, "neck"):
-        if model_config.model.neck.type == "GlobalAveragePooling":
+    if hasattr(model_config.model, "neck") and model_config.model.neck:
+        if model_config.model.neck.get("type", None) == "GlobalAveragePooling":
             model_config.model.neck.pop("in_channels", None)
         else:
             print(f"\tUpdate model.neck.in_channels: {out_channels}")
diff --git a/otx/cli/builder/supported_backbone/mmcls.json b/otx/cli/builder/supported_backbone/mmcls.json
index a63bfc636b3..71f10692aa5 100644
--- a/otx/cli/builder/supported_backbone/mmcls.json
+++ b/otx/cli/builder/supported_backbone/mmcls.json
@@ -299,7 +299,7 @@
       "options": {
         "arch": ["base", "small"]
       },
-      "available": ["CLASSIFICATION"]
+      "available": []
     },
     "mmcls.PCPVT": {
       "required": ["arch"],
diff --git a/otx/mpa/cls/inferrer.py b/otx/mpa/cls/inferrer.py
index 3c511d91212..9c7e5770219 100644
--- a/otx/mpa/cls/inferrer.py
+++ b/otx/mpa/cls/inferrer.py
@@ -56,6 +56,7 @@ def run(self, model_cfg, model_ckpt, data_cfg, **kwargs):
         dump_saliency_map = kwargs.get("dump_saliency_map", False)
         # TODO: It looks like we need to modify that code in an appropriate way.
         if model_cfg.model.head.get("type", None) == "VisionTransformerClsHead":
+            dump_features = False
             dump_saliency_map = False
         eval = kwargs.get("eval", False)
         outputs = self.infer(
diff --git a/otx/mpa/cls/stage.py b/otx/mpa/cls/stage.py
index d12eb790e05..dd78acbfffa 100644
--- a/otx/mpa/cls/stage.py
+++ b/otx/mpa/cls/stage.py
@@ -8,8 +8,8 @@
 import torch
 from mmcv import ConfigDict, build_from_cfg
 
-from otx.algorithms.classification.adapters.mmcls.utils.builder import build_classifier
 from otx.algorithms import TRANSFORMER_BACKBONES
+from otx.algorithms.classification.adapters.mmcls.utils.builder import build_classifier
 from otx.mpa.stage import Stage
 from otx.mpa.utils.config_utils import recursively_update_cfg, update_or_add_custom_hook
 from otx.mpa.utils.logger import get_logger
diff --git a/otx/mpa/modules/hooks/recording_forward_hooks.py b/otx/mpa/modules/hooks/recording_forward_hooks.py
index 0935ce8c9e6..0a6a6cc2a63 100644
--- a/otx/mpa/modules/hooks/recording_forward_hooks.py
+++ b/otx/mpa/modules/hooks/recording_forward_hooks.py
@@ -117,12 +117,10 @@ def func(feature_map: Union[torch.Tensor, Sequence[torch.Tensor]], fpn_idx: int
 
 
 class FeatureVectorHook(BaseRecordingForwardHook):
-    def func(self, feature_map: Union[torch.Tensor, Sequence[torch.Tensor]]) -> torch.Tensor:
+    @staticmethod
+    def func(feature_map: Union[torch.Tensor, Sequence[torch.Tensor]]) -> torch.Tensor:
         """Generate the feature vector by average pooling feature maps."""
-        if self._module.backbone.__class__.__name__ in TRANSFORMER_BACKBONES and isinstance(feature_map[-1], (tuple, list)):
-            # mmcls.VisionTransformer outputs Tuple[List[...]] and the last index of List is the final logit.
-            feature_vector, _ = feature_map[-1]
-        elif isinstance(feature_map, (list, tuple)):
+        if isinstance(feature_map, (list, tuple)):
             # aggregate feature maps from Feature Pyramid Network
             feature_vector = [torch.nn.functional.adaptive_avg_pool2d(f, (1, 1)) for f in feature_map]
             feature_vector = torch.cat(feature_vector, 1)

From 631055474766018ea1506f4c4d2eb9ffe1219e4b Mon Sep 17 00:00:00 2001
From: "Kang, Harim" <harim.kang@intel.com>
Date: Mon, 20 Mar 2023 22:36:57 +0900
Subject: [PATCH 5/5] Remove unused import

---
 otx/mpa/modules/hooks/recording_forward_hooks.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/otx/mpa/modules/hooks/recording_forward_hooks.py b/otx/mpa/modules/hooks/recording_forward_hooks.py
index 0a6a6cc2a63..4b3fc7011e2 100644
--- a/otx/mpa/modules/hooks/recording_forward_hooks.py
+++ b/otx/mpa/modules/hooks/recording_forward_hooks.py
@@ -20,7 +20,6 @@
 import torch
 
 from otx import MMCLS_AVAILABLE
-from otx.algorithms import TRANSFORMER_BACKBONES
 
 if MMCLS_AVAILABLE:
     from mmcls.models.necks.gap import GlobalAveragePooling